def run_online_classifier():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             tokenizer=tokenizer_streaming)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
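# The snippet above assumes `stream_docs` and `tokenizer_streaming` helpers
# that are not shown (get_minibatch appears in a later snippet). A minimal
# sketch of what they might look like, assuming movie_data.csv has a header
# row and lines of the form "<review text>",<label> -- both the CSV layout
# and the trivial tokenizer are assumptions, not the original code:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

def tokenizer_streaming(text):
    # trivial whitespace tokenizer; real code would clean HTML,
    # emoticons, stop words, etc.
    return text.split()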
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)  # fetch only this batch, not all of docs
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
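# big_kmeans relies on `batches` and `fetch` helpers that aren't shown.
# A minimal sketch under the assumption that `batches` chunks an iterable
# and `fetch` resolves a document reference to its text:
import itertools

def batches(iterable, size):
    # Yield successive lists of at most `size` items from `iterable`.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk

def fetch(doc):
    # Placeholder: in the real code this presumably loads the document's
    # text (e.g. from disk or a URL); here we assume docs are already text.
    return doc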
def get_hashing(data):
    t0 = time.time()
    print("* Making hashing vectorizer with the data ...")
    # norm='l2' projects each row onto the Euclidean unit sphere
    hasher = HashingVectorizer(stop_words='english', ngram_range=(1, 3),
                               norm='l2', non_negative=True)
    hX = hasher.fit_transform(data)
    print("done in %0.3fs." % (time.time() - t0))
    return hX, hasher
def main():
    '''
    >>> main() # stuff happens
    '''
    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features=2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max))

    estimator = AffinityPropagation()

    for chunk in chunks:
        logging.info('hashing chunk')
        chunk_vector = hasher.transform([str(i.seq) for i in chunk])
        logging.info('clustering')
        estimator.fit(chunk_vector)
        logging.info('got %s clusters' % len(set(estimator.labels_)))
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    return clf
def sim_char10(text1, text2):
    # Note: with analyzer='char_wb' the tokenizer and stop_words arguments
    # are ignored; character 10-grams are extracted directly.
    vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize,
                             stop_words='english', ngram_range=(10, 10))
    texts = [text1, text2]
    matrix = vect.fit_transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize,
                             stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10,
             coef0=2, n_features=10000000, ngram_range=(1, 10), tfidf=False,
             dfrange=(2, 1.0), probability=False, class_weight=None):
    self.conn = None
    self.is_tfidf = tfidf
    if tfidf:
        self.vectorizer = TfidfVectorizer(stop_words=None,
                                          min_df=dfrange[0],
                                          max_df=dfrange[1],
                                          max_features=n_features,
                                          strip_accents='unicode',
                                          ngram_range=ngram_range,
                                          analyzer='word', norm='l2')
    else:
        self.vectorizer = HashingVectorizer(stop_words=None,
                                            non_negative=True,
                                            n_features=n_features,
                                            strip_accents='unicode',
                                            ngram_range=ngram_range,
                                            analyzer='word', norm='l2')
    self.param_set = {'C': str(C), 'kernel': str(kernel),
                      'gamma': str(gamma), 'degree': str(degree),
                      'coef0': str(coef0), 'n_features': str(n_features)}
    if class_weight == 'auto':
        # Count examples per class, then weight each class by the inverse
        # of its count.
        class_weight = {}
        for item in train_data.target:
            if class_weight.get(item):
                class_weight.update({item: class_weight[item] + 1.0})
            else:
                class_weight.update({item: 1.0})
        for key in class_weight:
            class_weight.update({key: 1.0 / class_weight[key]})
    self.class_weight_dict = class_weight
    super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma,
                                   shrinking=True, probability=probability,
                                   degree=degree, coef0=coef0, tol=0.001,
                                   cache_size=20000,
                                   class_weight=class_weight, verbose=False,
                                   max_iter=-1)
    if self.is_tfidf:
        train_x = self.vectorizer.fit_transform(train_data.data)
    else:
        train_x = self.vectorizer.transform(train_data.data)
    self.fit(train_x, train_data.target)
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True,
                               stop_words='english')
    # List of topics extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select features using a chi-square test
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)

    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents. """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)  # fetch only this batch, not all of docs
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
def test_same_output(self):
    X, X_rdd = self.make_text_rdd()
    local = HashingVectorizer()
    dist = SparkHashingVectorizer()
    result_local = local.transform(X).toarray()
    result_dist = dist.transform(X_rdd).toarray()
    assert_array_equal(result_local, result_dist)
def test_same_output(self):
    X, X_rdd = self.generate_text_dataset()
    local = HashingVectorizer()
    dist = SparkHashingVectorizer()
    result_local = local.transform(X)
    result_dist = sp.vstack(dist.transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())
def predict(line, tagger):
    # note: `lambda (x): ...` is Python-2-only tuple-parameter syntax
    tok_cn = lambda x: crfseg.cut_zh(x, tagger)  # Chinese word segmentation
    hasher = HashingVectorizer(n_features=2**16, tokenizer=tok_cn,
                               non_negative=True, norm=None, binary=False)
    x_test = hasher.transform([line])
    return clf_global.predict_proba(x_test)
class ReviewTrainer(TrainerModel):

    def __init__(self):
        pass

    # gets rid of stopwords
    def preprocess(self, l):
        res = {}
        sw = stopwords.words('english')
        clean = ' '.join([w for w in l['text'].split() if w not in sw])
        res[l['review_id']] = {'text': clean, 'label': l['votes']['useful']}
        return res

    # the labels are already given for this review
    def group_labels(self, fname):
        pass

    # vectorizes data and selects K best feats.
    def prepare_data(self, x, y):
        self.hv = HashingVectorizer(strip_accents='ascii', non_negative=True)
        self.feats = self.hv.transform(x)
        self.labels = np.array(y)
        self.ch2 = SelectKBest(chi2, k=K_FEAT)
        self.feats = self.ch2.fit_transform(self.feats, self.labels)

    def get_error(self, pred, y):
        return super(ReviewTrainer, self).get_error(pred, y)

    # optimizes for hyper-parameter alpha
    def _cross_validate(self):
        grid = dict(alpha=10.0 ** np.arange(-4, 1))
        return super(ReviewTrainer, self)._cross_validate_base(Ridge(), grid)

    # builds examples to feed trainer
    # MUST RUN BEFORE train
    def build_examples(self, data, labels=None):
        feats = []
        labels = []
        ex = {}
        for k, v in data.items():
            feats.append(v['text'])
            labels.append(v['label'])
        ex['feats'] = feats
        ex['labels'] = labels
        return ex

    # fits model using optimal parameters
    def train(self):
        self.clf = self._cross_validate()
        self.clf.fit(self.feats, self.labels)

    # predicts Y given X
    def predict(self, data):
        data = self.hv.transform(data)
        data = self.ch2.transform(data)
        pred = self.clf.predict(data)
        return pred
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier,
                 hashing=False, chi_squared=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RidgeClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils.extmath import density

    y_train = y_VariableList

    if hashing == True:
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(x_VariableList)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_VariableList)

    X_test = vectorizer.transform(testSetList)

    if chi_squared == True:
        print("Extracting best features by a chi-squared test")
        ch2 = SelectKBest(chi2, k=2 * 16)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    classifierObject = ""
    print "Using :", classifier
    if classifier == "LinearSVC":
        classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3)
    elif classifier == "PassiveAggressiveClassifier":
        classifierObject = PassiveAggressiveClassifier(
            C=1.0, fit_intercept=True, loss='hinge', n_iter=50, n_jobs=1,
            random_state=None, shuffle=True, verbose=0, warm_start=False)
    elif classifier == "RidgeClassifier":
        classifierObject = RidgeClassifier(
            alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
            max_iter=None, normalize=False, solver='lsqr', tol=0.01)
    elif classifier == "Perceptron":
        classifierObject = Perceptron(
            alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
            n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
            verbose=0, warm_start=False)
    elif classifier == "SGDClassifier":
        classifierObject = SGDClassifier(
            alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
            eta0=0.0, fit_intercept=True, l1_ratio=0.15,
            learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
            penalty='l2', power_t=0.5, random_state=None, shuffle=True,
            verbose=0, warm_start=False)

    classifierObject.fit(X_train, y_train)
    pred = classifierObject.predict(X_test)
    return pred[0]
def vectorize(train_words, test_words):
    # stop word list
    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])
    v = HashingVectorizer(non_negative=True, stop_words=stopwords,
                          n_features=30000)
    # HashingVectorizer is stateless, so fit_transform and transform are
    # equivalent here.
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data
def get_x(text, ngram_range):
    hash_vect_object = HashingVectorizer(ngram_range=ngram_range,
                                         stop_words="english",
                                         strip_accents="unicode")
    tfidf_transformer_object = TfidfTransformer(use_idf=True)
    x_train_counts = hash_vect_object.fit_transform(text)
    x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)
    return x_train_tfidf
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
def main(output=RESULTS):
    # change ROOT ID in config.py to your computer's path so that it writes
    # to the correct file

    # load data and put it into the desired numpy format
    movies = load_balanced_movies(MOVIES_DATA, False)  # True is for debugging
    data = pd.DataFrame(movies)
    pd.options.mode.chained_assignment = None  # default='warn'; ignore
    summaries = data[['summary']]
    # clean out punctuation and special characters
    summaries['summary'] = summaries['summary'].str.replace('[^\w\s]', '').str.lower()
    Y = np.array(data[['year']])
    Y = np.ravel(Y)
    X = np.array(summaries['summary'])

    # standard CountVectorizer for bag of words
    # vectorizer = CountVectorizer()
    # X = vectorizer.fit_transform(X)
    # print "Old Shape Dim"
    # print X.shape

    # uses random projections to reduce dimensionality
    # transformer = random_projection.SparseRandomProjection()
    # X_new = transformer.fit_transform(X)
    # print "New Shape Dim"
    # print X_new.shape

    # perform vectorization and dim reduction using HashingVectorizer
    # (counts of times a word appears)
    vectorizer = HashingVectorizer(stop_words='english',
                                   n_features=80000)  # 80,000 hash buckets
    X = vectorizer.transform(X)

    # instantiate scaling of data for preprocessing
    X = StandardScaler(with_mean=False).fit_transform(X)

    # splits training and test data equally
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

    names = ["SGDClassifier", "Linear SVC", "SVC Kernel RBF", "PerceptronL1",
             "PerceptronL2", "Nearest Neighbors", "Ridge Classifier"]
    classifiers = [
        SGDClassifier(loss="hinge", penalty="l2"),
        LinearSVC(),
        SVC(kernel="rbf"),
        Perceptron(penalty='l1'),
        Perceptron(penalty='l2', n_iter=25),
        KNeighborsClassifier(),
        RidgeClassifier(),
    ]

    print "Calculating accuracies"
    # fit each classifier on the training data, then predict and score
    for name, clf in zip(names, classifiers):
        print name
        clf.fit(xtrain, ytrain)
        print "Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest))
        with open(output, "a+") as outputFile:  # write results to file
            score = 100 * clf.score(xtest, ytest)
            outputFile.write("Ran classifier {} ".format(name) + '\n'
                             " Achieved accuracy {} ".format(score))
def vectorize(docs):
    """
    Vectorize documents.
    :param docs list: iterable over raw text documents
    :return:
    """
    v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000,
                          non_negative=True)
    train_data = v.fit_transform(docs)
    return train_data
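# `comma_tokenizer` is not defined in these snippets. Given the name, a
# plausible sketch (an assumption, not the original helper) is a tokenizer
# for text that has already been word-segmented and joined with commas:
def comma_tokenizer(text):
    # Hypothetical: assumes the text was pre-segmented (e.g. by jieba)
    # and the tokens joined with commas.
    return [tok for tok in text.split(',') if tok]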
def ngrams_hashing_vectorizer(strings, n, n_features):
    """ Return a hashed count matrix of every character n-gram in the
    given strings. """
    hv = HashingVectorizer(analyzer='char', ngram_range=(n, n),
                           n_features=n_features, norm=None,
                           alternate_sign=False)
    hash_matrix = hv.fit_transform(strings)
    return hash_matrix
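# Illustrative call (inputs are made up): hash the character trigrams of
# two short strings into 1024 buckets.
mat = ngrams_hashing_vectorizer(['paris', 'parisian'], n=3, n_features=2**10)
print(mat.shape)  # (2, 1024); each row holds one string's trigram counts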
def feature_extraction(self, test):
    """
    Feature extraction.
    :param test:
    :return: training features, test features
    """
    train = self.load_train_set()
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=25000)
    fea_train = vectorizer.fit_transform(train)  # extract training features
    fea_test = vectorizer.fit_transform(test)    # extract test features
    return fea_train, fea_test
def feature_extraction(feature, target_name, df):
    vect = HashingVectorizer(decode_error='ignore', ngram_range=(1, 2),
                             n_features=2**18, binary=True, norm="l2")
    le = preprocessing.LabelEncoder()
    # for multiple features replace this with
    # http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
    df[feature] = df[feature].fillna('')
    titles = vect.transform(df[feature])
    X = titles
    # y = le.fit_transform(df[target_name])
    y = df[target_name]
    return X, y
def vector_func_char(l):
    vectorizer = HashingVectorizer(
        analyzer="char",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        ngram_range=(2, 2),
        n_features=524288,
    )
    # The first whitespace-separated token is the label; the rest of the
    # line is vectorized. The document is wrapped in a list because
    # HashingVectorizer expects an iterable of documents, not a bare string
    # (the original passed the string directly).
    label = str(l).split(" ")[0]
    body = str(l).replace(label, "")
    return label, vectorizer.fit_transform([body])
def get_x(text, ngram_range):
    # token_pattern keeps only alphabetic tokens of two or more characters
    hash_vect_object = HashingVectorizer(
        ngram_range=ngram_range,
        stop_words="english",
        strip_accents="unicode",
        token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b")
    tfidf_transformer_object = TfidfTransformer(use_idf=True)
    x_train_counts = hash_vect_object.fit_transform(text)
    x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)
    return x_train_tfidf
def vector_func_word(l):
    vectorizer = HashingVectorizer(
        non_negative=True,
        stop_words="english",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        n_features=262144,
    )
    # return str(l).split(" ")[0], vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0], ""))
    return vectorizer.fit_transform(l).shape
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True,
                                n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                              stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()
    return vec, X_noProcess
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print check_neg(input_words)
    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer,
                              stop_words=stopwords,
                              n_features=100000,
                              non_negative=True)
        test_data = v.fit_transform(test_words)
        print test_data
        return test_data, check_neg(input_words)
def vectorize(concepts):
    """
    Vectorize a list or a string of concepts.

    The regular `vectorize` method is meant to vectorize text documents; it
    is trained for that kind of data and is thus inappropriate for concepts.
    So instead we just use a simple hashing vectorizer.
    """
    h = HashingVectorizer(input='content', stop_words='english', norm=None,
                          tokenizer=Tokenizer())
    if type(concepts) is str:
        # Extract and return the vector for the single document.
        return h.transform([concepts]).toarray()[0]
    else:
        return h.transform(concepts)
def __init__(self, corpus, classes, method):
    # Set up vectorizer
    if method == 'count':
        self.vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 3))
    elif method == 'tfidf':
        self.vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 3))
    elif method == 'hashing':
        self.vectorizer = HashingVectorizer(non_negative=True)
    else:
        print 'Method must be count, tfidf, or hashing'

    # Vectorize and set up classifier.
    self.X = self.vectorizer.fit_transform(corpus)
    classifier = MultinomialNB()
    self.classifier = classifier.fit(self.X, classes)
class Bayes(object):
    def __init__(self, config_dir):
        """Initialize a Bayes model.

        Args:
            config_dir: directory containing the configuration file.
        """
        self.name = 'Bayes'
        self.config_dir = config_dir
        self.config = dict()
        self.Vec = None
        self.clf = None
        self.output_path = './result/Bayes/'
        if not os.path.exists(self.output_path):
            os.mkdir(self.output_path)
        self.model_path = './model/Bayes/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    def get_config(self):
        """Read the config file into the config dictionary.

        Raises:
            ValueError: the config file does not exist.
        """
        if os.path.exists(self.config_dir):
            print('Read in configuration from dir %s.\n' % self.config_dir)
            with open(self.config_dir + 'Bayes_config', 'rt') as f:
                for line in f.readlines():
                    tmp = line.strip().split(':')
                    self.config[tmp[0]] = tmp[1]
            for key, value in self.config.items():
                print('%s:%s' % (key, value))
            print('Build up the model according to configuration.')
            if self.config.get('alpha') == None:
                raise ValueError(
                    'Parameter Alpha has not been set. Please re-edit the configuration file.')

            # Build up the feature vectorizer.
            if self.config.get('feature') == None:
                raise ValueError(
                    'Feature parameter has not been set. Please re-edit the configuration file.')
            else:
                if self.config['feature'] == 'CountVectorizer':
                    self.Vec = CountVectorizer()
                elif self.config['feature'] == 'TfidfVectorizer':
                    self.Vec = TfidfVectorizer()
                elif self.config['feature'] == 'HashingVectorizer':
                    self.Vec = HashingVectorizer()
                else:
                    raise ValueError(
                        'Can not use %s as a feature, please re-edit your configuration file.'
                        % self.config['feature'])

            # Build up the model.
            if self.config.get('model') == None:
                raise ValueError(
                    'Model parameter has not been set. Please re-edit the configuration file.')
            else:
                if self.config['model'] == 'GaussianNB':
                    self.clf = GaussianNB()
                elif self.config['model'] == 'MultinomialNB':
                    self.clf = MultinomialNB(alpha=float(self.config['alpha']))
                elif self.config['model'] == 'BernoulliNB':
                    self.clf = BernoulliNB()
                else:
                    raise ValueError(
                        'No model named %s, please re-edit your configuration file.'
                        % self.config['model'])
        else:
            print('Configuration file %s does not exist.' % self.config_dir)

    def fit(self, train):
        """Fit the data into the model and train.

        Args:
            train: training data in format (data, label)
        """
        fea = self.Vec.fit_transform(train.data)
        fea = fea.todense()
        self.clf.fit(fea, train.target)

    def predict(self, test_data):
        """Run the model to get the predicted result.

        Args:
            test_data: the data to predict on.
        Returns:
            predicted result.
        """
        fea = self.Vec.transform(test_data)
        fea = fea.todense()
        return self.clf.predict(fea)

    def test(self, test):
        """Test the model.

        Args:
            test: test dataset in format (data, label)
        Output:
            result file: one predicted label per line.
        Returns:
            accuracy
        """
        preds = self.predict(test.data)
        # Write predictions into files.
        filename = (self.output_path + self.config['model'] + '_'
                    + self.config['feature'])
        with open(filename, 'wt', encoding='utf-8') as f:
            for pred in preds:
                f.write(str(pred) + '\n')
        return evaluate(test.target, preds)
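# `get_config` expects a plain-text file named `Bayes_config` inside
# config_dir, one colon-separated key:value pair per line. An illustrative
# file (the values here are assumptions) would contain:
#
#     alpha:1.0
#     feature:TfidfVectorizer
#     model:MultinomialNB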
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

os.chdir("E:/graduate/class/非结构化/文本分析_李翠平/垃圾短信数据集与代码")
mescon_all = pd.read_csv('result.csv', header=None, encoding='gbk')

# Drop messages whose content is empty, keeping only messages that can be
# turned into feature vectors.
listtodel = []
for i, line in enumerate(mescon_all[1]):
    if type(line) != np.unicode:  # np.unicode is numpy's unicode type
        listtodel.append(i)
mescon_all = mescon_all.drop(listtodel)

# vector = TfidfVectorizer(CountVectorizer())
# temp = vector.fit_transform(mescon_all[1]).todense()

outfile = open('features.txt', 'w')
vector = HashingVectorizer(n_features=100)
temp = vector.transform(mescon_all[1]).todense()
x = [[i, j] for i, j in enumerate(mescon_all[0])]
temp = temp.tolist()
for i, line in enumerate(temp):
    outstr = ''
    for word in line:
        outstr += str(word + 1)
        outstr += ' '
    outfile.write((str(mescon_all[0][x[i][1]]) + ',' + outstr) + '\n')
outfile.close()
import os
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# == Load data ==
print("Loading dataset...")
files = os.listdir()
D = []
# Labels:
# - dry: 0
# - normal: 1
# - oil: 2
label = np.array([])
print(len(D), " documents, ", len(label), " labels.")

# == Chinese Segmentation ==
# (the segmentation loop was elided in the original)

# == Vector Transformation ==
print("Extracting features from the dataset...")
vectorizer = Pipeline([
    ('vect', HashingVectorizer(n_features=(2 ** 21), non_negative=True,
                               lowercase=False)),
    ('tfidf', TfidfTransformer(norm='l2')),
])

if __name__ == '__main__':
    vectorizer.fit(D)  # the original called fit() with no argument
split_size = int(len(news.data) * SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    ('vect', HashingVectorizer(non_negative=True)),
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross-validation iterator of K folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
# print('using counts of rel. entity cluster nodes')
## clusters = int(X.shape[1]/(10*math.sqrt(s)))
# use number of tfidf extracted features as basis for word vector clustering
# kmeans = KMeans(n_clusters=clusters)
# nodes = kmeans.fit_predict(np.asarray(rel_vectors))
# X = np.zeros([s+1, clusters])
# for source, node in zip(rel_sources, nodes):
#     X[source, node] += 1
# evaluate(MultinomialNB(), X)
# print('repeat with -', names[0])
# evaluate(classifiers[0], X)

for model, name in zip(classifiers, names):
    print('using hashed count vectors -', name)
    hasher = HashingVectorizer(n_features=1000, stop_words='english',
                               norm='l2')
    vectorizer = make_pipeline(hasher, TfidfTransformer())
    X = vectorizer.fit_transform(files)
    evaluate(model, X)

    print('using avg softmax -', name)
    X = np.asarray(probs)
    evaluate(model, X)

    print('now try with each feature in turn -', name)
    for feature in range(X.shape[1]):
        X = np.asarray([[f[feature]] for f in probs])
        evaluate(model, X)

    print('using avg similarity -', name)
def trainAndEvaluateModels(dataFrame):
    # We'll test-drive a few vectorizers. HashingVectorizer is famed to be
    # memory-efficient!
    vectorizers = {'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'}
    # We'll also try out some classifiers. Should be fun!
    classifiers = {'MultinomialNB', 'BernoulliNB', 'SGDClassifier',
                   'PassiveAggressiveClassifier'}

    # For each combination of vectorizer and classifier
    for vectorizer in vectorizers:
        for classifier in classifiers:
            vect = None
            if (vectorizer == 'CountVectorizer'):
                vect = CountVectorizer()
            elif (vectorizer == 'HashingVectorizer'):
                vect = HashingVectorizer(stop_words='english',
                                         non_negative=True, norm=None,
                                         binary=False)
            elif (vectorizer == 'TfidfVectorizer'):
                vect = TfidfVectorizer(max_df=0.5, min_df=2,
                                       stop_words='english', use_idf=True)

            clf = None
            if (classifier == 'MultinomialNB'):
                clf = MultinomialNB()
            elif (classifier == 'BernoulliNB'):
                clf = BernoulliNB()
            elif (classifier == 'SGDClassifier'):
                clf = SGDClassifier()
            elif (classifier == 'PassiveAggressiveClassifier'):
                clf = PassiveAggressiveClassifier()

            # Some spirits don't mix!
            if (vectorizer == 'HashingVectorizer'
                    and classifier == 'BernoulliNB'):
                continue

            # (fixed: the original compared against the misspelled
            # 'TfIdfVectorizer', so this branch always ran)
            if (vectorizer != 'TfidfVectorizer'):
                # Setup a pipeline to vectorize and classify data.
                pipeline = Pipeline([('vectorizer', vect),
                                     ('classifier', clf)])

                # We will divide our dataset into 10 pieces, train on 9 of
                # them and test on the remaining one, until each piece has
                # been the test piece at least once.
                k_fold = KFold(n=len(dataFrame), n_folds=10)
                totalScore = 0
                for train_indices, test_indices in k_fold:
                    train_text = dataFrame.iloc[train_indices]['text'].values
                    train_y = dataFrame.iloc[train_indices]['label'].values
                    test_text = dataFrame.iloc[test_indices]['text'].values
                    test_y = dataFrame.iloc[test_indices]['label'].values

                    # Train the model on the training set
                    pipeline.fit(train_text, train_y)
                    # Test the model on the test set
                    predictions = pipeline.predict(test_text)
                    # print predictions
                    score = f1_score(test_y, predictions)
                    totalScore = totalScore + score
                print 'Vectorizer: ', vectorizer, ' Classifier: ', classifier, ' Average prediction score: ', (totalScore / 10)

            # Some spirits don't mix!!
            if classifier == 'MultinomialNB':
                continue

            # Setup a pipeline to work with a transformer too.
            pipeline1 = Pipeline([('vector', vect),
                                  ('transform', TfidfTransformer()),
                                  ('classifier', clf)])

            # Same 10-fold scheme as above.
            k_fold = KFold(n=len(dataFrame), n_folds=10)
            totalScore = 0
            for train_indices, test_indices in k_fold:
                train_content = dataFrame.iloc[train_indices]['text'].values
                train_labels = dataFrame.iloc[train_indices]['label'].values
                test_content = dataFrame.iloc[test_indices]['text'].values
                test_labels = dataFrame.iloc[test_indices]['label'].values

                # Train the model on the training set
                pipeline1.fit(train_content, train_labels)
                # Test the model on the test set
                predictions = pipeline1.predict(test_content)
                score = f1_score(test_labels, predictions)
                totalScore = totalScore + score
            print 'Vectorizer: ', vectorizer, ' TfIdfTransformer, Classifier: ', classifier, ' Average prediction score: ', (totalScore / 10)
import os
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer

curdir = os.path.dirname(__file__)

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower())
            + ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split()
                 if w not in stopwords.words('english')]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
def _get_vectorizer(vectorizer, training_mode,
                    vectorizer_file="vectorizer.pkl"):
    token_pattern = r'\S+'
    if not training_mode and vectorizer not in [
            DocumentPoolEmbeddings.__name__]:
        v = pickle_manager.load(vectorizer_file)
        assert vectorizer == v.__class__.__name__
    elif vectorizer == TfidfVectorizer.__name__:
        v = TfidfVectorizer(input='content', encoding='utf-8',
                            decode_error='strict', strip_accents=None,
                            lowercase=True, preprocessor=None,
                            tokenizer=None, analyzer='word', stop_words=[],
                            token_pattern=token_pattern, ngram_range=(1, 1),
                            max_df=1.0, min_df=1, max_features=None,
                            vocabulary=None, binary=False, dtype=np.float64,
                            norm='l2', use_idf=True, smooth_idf=True,
                            sublinear_tf=False)
    elif vectorizer == CountVectorizer.__name__:
        v = CountVectorizer(input='content', encoding='utf-8',
                            decode_error='strict', strip_accents=None,
                            lowercase=True, preprocessor=None,
                            tokenizer=None, stop_words=[],
                            token_pattern=token_pattern, ngram_range=(1, 1),
                            analyzer='word', max_df=1.0, min_df=1,
                            max_features=None, vocabulary=None,
                            binary=False, dtype=np.int64)
    elif vectorizer == HashingVectorizer.__name__:
        v = HashingVectorizer(input='content', encoding='utf-8',
                              decode_error='strict', strip_accents=None,
                              lowercase=True, preprocessor=None,
                              tokenizer=None, stop_words=[],
                              token_pattern=token_pattern,
                              ngram_range=(1, 1), analyzer='word',
                              n_features=1048576, binary=False, norm='l2',
                              alternate_sign=True, non_negative=False,
                              dtype=np.float64)
    elif vectorizer == DocumentPoolEmbeddings.__name__:
        v = DocumentPoolEmbeddings(
            [BertEmbeddings('bert-base-multilingual-uncased')])
    else:
        raise ValueError("Invalid vectorizer: %s" % (vectorizer))
    return v
def jieba_tokenize(text):
    return jieba.cut(text)

tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
'''
tokenizer: the tokenizer function to use.
lowercase: whether to lowercase all text before tokenizing; since we are
dealing with Chinese text, this is best left False.
'''

text_list = [
    "今天天气真好啊啊啊啊", "小明上了清华大学", "我今天拿到了Google的Offer",
    "清华大学在自然语言处理方面真厉害"
]

hv = HashingVectorizer(tokenizer=jieba_tokenize, n_features=10)
tt = hv.transform(text_list)
print(tt)

# the corpus of texts to cluster
tfidf_matrix = tfidf_vectorizer.fit_transform(text_list)
print(tfidf_vectorizer)
# print(tfidf_matrix)
terms = tfidf_vectorizer.get_feature_names()
print(terms)
print(len(terms))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

y_train = pd.read_table('F:\\WeiWeiHe\\data_y.txt', header=None)
y_train = np.array(y_train).ravel()
X_train = corpus

# get the number of distinct label classes in Y
true_k = np.unique(y_train).shape[0]
print y_train

X_test = X_train
Y_test = y_train

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

# n_features must be defined before this point (see the --n-features option
# parsed below); the original passed the bare name positionally.
hasher = HashingVectorizer(n_features=n_features, non_negative=True,
                           binary=False)

# Several different parameters may be chosen depending on the situation:
# 1) build the option parser
from optparse import OptionParser
import sys

op = OptionParser()
op.add_option("--lsa", dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch", action="store_false", dest="minibatch",
              default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf", action="store_false", dest="use_idf",
              default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing", action="store_true", default=False,
              help="using a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) to extract from text.")
op.add_option("--verbose", action="store_true", dest="verbose",
              default=False, help="print report inside k-means")

op.print_help()
(opts, args) = op.parse_args()
train = train[train.year != 1]
train = train.merge(artist, how='inner', on='artist_id')
test = test.merge(artist, how='inner', on='artist_id')
del artist

# Binarize hotness around the mean. Compute the mean once up front: the
# original recomputed it after the first assignment had already changed
# the column.
hot_mean = train.song_hotttnesss.mean()
train.loc[train.song_hotttnesss > hot_mean, 'song_hotttnesss'] = 1
train.loc[train.song_hotttnesss < hot_mean, 'song_hotttnesss'] = 0

train.year = (train.year // 10) * 10
test.year = (test.year // 10) * 10

CategoricalFeatures = train[['artist_id', 'title', 'audio_md5']]

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

vectorizer = HashingVectorizer(n_features=750)
# vectorizer = TfidfVectorizer(min_df=0.0002)
TfidfVectorizerObject = vectorizer.fit(pd.concat([train.title, test.title]))
CountVectorizerTrainData = TfidfVectorizerObject.transform(train["title"])
CountVectorizerTestData = TfidfVectorizerObject.transform(test["title"])

DropFeatures = [
    'song_id', 'artist_id', 'title', 'audio_md5', 'analysis_sample_rate',
    'key_confidence', 'year', 'end_of_fade_in', 'duration',
    'time_signature_confidence', 'artist_latitude', 'artist_longitude'
]
trainSongId = train[['song_id']]
train = train.drop(DropFeatures, axis=1)
song_id = test['song_id']
def text_vectorizer_hashing_vectorizer(plots):
    vectorizer = HashingVectorizer(n_features=1000)
    return vectorizer.transform(plots)
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

### Most fake
sorted(zip(clf.coef_[0], feature_names))[:20]

# clearly there are certain words which might show political intent and
# source in the top fake features (such as the words corporate and
# establishment).
tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
# print(tokens_with_weights)

# --------------------------------------------------------------
# HashingVectorizer: requires less memory and is faster (because it is
# sparse and uses hashes rather than tokens)
# --------------------------------------------------------------
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)

# --------------------------------------------------------------
# Naive Bayes classifier for Multinomial model
# --------------------------------------------------------------
clf = MultinomialNB(alpha=.01)
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
class Classifier:
    """
    Wrapper class for model. Exposes two functions, train and predict.
    """
    _model: OneVsOneClassifier
    _modelTags: dict
    _labelDict: dict
    _Encoder: LabelEncoder
    _hashVect: HashingVectorizer
    modelName: str
    savePath: str

    _labelAll = ["negative", "neutral", "positive"]
    _stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
        'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
        'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
        'by', 'for', 'with', 'about', 'against', 'between', 'into',
        'through', 'during', 'before', 'after', 'above', 'below', 'to',
        'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
        'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
        'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so',
        'than', 'too', 'very', 's', 'can', 'will', 'just', 'don', "don't",
        'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
        'ain', 'ma'
    ]
    _auxverbs = {"wo": "will", "sha": "shall"}

    def __init__(self, makeNewModel: bool, modelName: str = None):
        """
        Constructor for the class.\n
        :param makeNewModel: Set to true if making new model. False otherwise.
        :param modelName: If makeNewModel is True, this parameter will be
            ignored. Otherwise, loads a model with the name supplied.
        """
        if not makeNewModel and re.search("classifier_\\d{11}(\\.P)?",
                                          modelName) is None:
            makeNewModel = True
        if not makeNewModel:
            if not modelName.endswith(PICKLE_FILE_EXTENSION):
                modelName += PICKLE_FILE_EXTENSION
            self.modelName = modelName
            self.savePath = model_folder + self.modelName
            self._model, self._modelTags = self._loadModel(self.savePath)
        else:
            self.modelName = ("classifier_" + getEpochIdentifier()
                              + PICKLE_FILE_EXTENSION)
            self.savePath = model_folder + self.modelName
            self._initUser()
        self._Encoder = LabelEncoder()
        self._Encoder.fit_transform(self._labelAll)
        self._labelDict = dict(
            zip(self._Encoder.classes_,
                self._Encoder.transform(self._Encoder.classes_)))
        self._hashVect = HashingVectorizer(decode_error='replace',
                                           n_features=2**20,
                                           alternate_sign=False)
        # initialize model for each sentiment by passing blank data
        for sents in self._labelAll:
            self.train(text='', sentiment=sents)

    def train(self, text: str or list, sentiment: str, tags=None,
              returnProcessedText=False, fromDB=False):
        """
        Wrapper method that trains the model on only a single piece of data.
        Set fromDB = True if the data is from the database.
        :param text: A string or a list. List should only be used on data
            already preprocessed, i.e. from database.
        :param sentiment: The sentiment for the data. Accepted inputs are
            only 'positive', 'negative' or 'neutral'.
        :param returnProcessedText: Set to true to return the processed
            text. Use to save to database.
        :param fromDB: Set to true only if the data (list) is from the
            database.
        :return: The processed text as a list. Only returns if
            returnProcessedText = True.
""" if type(text) is str: text = self._preprocessText(text) elif type(text) is list: if fromDB: # required for already processed text stored in dbs text = self._preprocessFromDB(text) text = self._getSeriesFromList(text) if type(tags) is str: tags = separateTags(tags) self._saveModel(self._train(text, sentiment, tags), self.savePath) if returnProcessedText: text = list(text) return text def predict(self, rawText: str): """ Wrapper method that predicts a single piece of data. :param rawText: The raw (unprocessed) text to predict on. :return: The sentiment of the predicted text and the tags. """ processedText = self._preprocessText(rawText) return self._predict(processedText) @staticmethod def _preprocessFromDB(processedText: str) -> list: """ Wrapper method for when the data is from the database. Separates the string data into a list of words. :param processedText: The already-processed text from the database. :return: Returns a list to be trained. """ return _separateTags(processedText) @staticmethod def _getSeriesFromList(processedTextList: list) -> pd.Series: """ Gets a pandas series from a list of processed text. :param processedTextList: A list containing the processed text/words. :return: Returns a pandas series of the text. """ return pd.Series(' '.join(processedTextList)) def _preprocessText(self, pre_text): """ Performs preprocessing on the text. :param pre_text: A unprocessed string to perform text on. Accepts tuple, list and str. :return: A pandas series of processed text. """ if type(pre_text) is list or type(pre_text) is tuple: pre_text = ' '.join(pre_text) # remove HTMl tags, if any pre_text = self._strip_html(pre_text) # split into multiple sentences text_tuple = sent_tokenize(pre_text, language='english') # tokenize text_tuple = [ word_tokenize(sentence, language='english') for sentence in text_tuple ] for sentence in text_tuple: for index, word in enumerate(sentence): sentence[index] = word.lower() if word == "n't": sentence[index] = "not" if sentence[index - 1] in self._auxverbs: sentence[index - 1] = self._auxverbs[sentence[index - 1]] # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. tag_map = defaultdict(lambda: wn.NOUN) tag_map['J'] = wn.ADJ tag_map['V'] = wn.VERB tag_map['R'] = wn.ADV # remove stop words + lemmatization filtered_final = [] for index, entry in enumerate(text_tuple): word_Lemmatized = WordNetLemmatizer() for word, tag in pos_tag(entry): if word not in self._stopwords and word.isalpha(): word_Final = word_Lemmatized.lemmatize( word, tag_map[tag[0]]) filtered_final.append(word_Final) # change all text to lower case filtered_final = [_entry.lower() for _entry in filtered_final] # concatenate into one data instead of multiple individual words _textSeries = pd.Series(' '.join(filtered_final)) return _textSeries def _initUser(self): """ Wrapper method to initialize a first time user. Creates a new model and saves the model. """ self._model = self._makeNewClassifier() self._modelTags = dict() self._saveModel([self._model, self._modelTags], self.savePath) @staticmethod def _strip_html(text: str): """ Strips HTMl tags from a string text. :param text: A raw unprocessed text. :return: The same text with HTMl tags stripped. """ soup = BeautifulSoup(text, "html.parser") return soup.getText() @staticmethod def _makeNewClassifier(): """ Makes a new model using OneVsOne classification. Model is a Logistic Regression algorithm fitted with SGD. :return: A OneVsOneClassifier object. 
""" return OneVsOneClassifier( SGDClassifier(loss='log', penalty='l2', max_iter=150, learning_rate='optimal', eta0=0.00, alpha=1e-04)) def _makeNewModelDict(self, tags): """ Creates a dict and initializes into the modelTags using the tag name as the key and the corresponding model as the value.\n The model is a binary class where class [0] is the tag (positive class) and class [1] is not the tag (negative class). For N number of tags, this dict will have N number of key-value pairs. This dict is essentially multiple OneVsRestClassifiers in one list.\n :param tags: List of string tags. """ for eachTag in tags: self._modelTags[eachTag] = MultinomialNB() @staticmethod def _saveModel(save_classifier, _filename): """ Saves a model locally. The models are saved in the ./Models/ folder. :param save_classifier: The model/classifier to save. :param _filename: The filename to save as. """ saveFile = open(_filename, 'wb') pickle.dump(save_classifier, saveFile) saveFile.close() @staticmethod def _loadModel(_filename): """ Loads a model from ./Models/ folder. :param _filename: The filename to load the classifier from. :return: The loaded model/classifier. """ loadFile = open(_filename, 'rb') classifier, modelTags = pickle.load(loadFile) loadFile.close() return classifier, modelTags def _train(self, _text, _sentiment, userTags): """ The inner method for training the model. The model is trained using partial_fit function. :param _text: A preprocessed pandas series. :param _sentiment: The raw string sentiment, either 'positive', 'negative' or 'neutral'. :return: The trained model. """ encSentiment = self._labelDict.get(_sentiment) X_new = self._hashVect.transform(_text) self._model.partial_fit( X_new, [encSentiment], self._Encoder.transform(self._Encoder.classes_)) # if not training the tag model if userTags is None: return self._model, self._modelTags # check if modelTags contain any existing tag classes # if yes, check if user added any new tags if len(self._modelTags.keys()) != 0: newTags = list(set(self._modelTags.keys()) - set(userTags)) # adds the new tag to the dict and add a new model if there are new tags if len(newTags) != 0: for eachNewTag in newTags: self._modelTags[eachNewTag] = MultinomialNB() else: # if modelTag doesn't contain any tag classes, initialize it for eachTag in userTags: self._modelTags[eachTag] = MultinomialNB() for eachTag in self._modelTags: if eachTag in userTags: self._modelTags[eachTag].partial_fit(X_new, [0], [0, 1]) else: self._modelTags[eachTag].partial_fit(X_new, [1], [0, 1]) return self._model, self._modelTags def _predict(self, _text): """ The inner method for getting predictions from the model. :param _text: A preprocessed pandas series. :return: The string sentiment, either 'positive', 'negative' or 'neutral'. """ X_new = self._hashVect.transform(_text) sentiment = getDictKey(self._labelDict, self._model.predict(X_new)) retTags = list() for eachTag in self._modelTags: if self._modelTags[eachTag].predict(X_new) == [0]: retTags.append(eachTag) return sentiment, retTags
from sklearn.datasets import fetch_20newsgroups

use_hashing = False
use_idf = False
minibatch = True
verbose = False
n_features = 10000
n_components = 10
true_k = 10

if use_hashing:
    if use_idf:
        hasher = HashingVectorizer(n_features=n_features,
                                   stop_words='english',
                                   alternate_sign=False, norm=None,
                                   binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=use_idf)
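# Illustrative continuation: fetch the corpus and vectorize it with
# whichever vectorizer was selected above (the subset choice is an
# assumption, not part of the original snippet).
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X = vectorizer.fit_transform(dataset.data)
print("n_samples: %d, n_features: %d" % X.shape)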
def __init__(self, chunksize=100000, **kwargs):
    self.chunksize = chunksize
    HashingVectorizer.__init__(self, **kwargs)
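# The __init__ above suggests a HashingVectorizer subclass that processes
# documents chunk by chunk. A minimal sketch of what the companion
# transform might look like, assuming that design (this is not the
# original implementation):
import scipy.sparse as sp
from sklearn.feature_extraction.text import HashingVectorizer

class ChunkedHashingVectorizer(HashingVectorizer):
    def __init__(self, chunksize=100000, **kwargs):
        self.chunksize = chunksize
        HashingVectorizer.__init__(self, **kwargs)

    def transform(self, X):
        # Vectorize `chunksize` documents at a time and stack the results,
        # bounding peak memory for very large corpora.
        parts = [HashingVectorizer.transform(self, X[i:i + self.chunksize])
                 for i in range(0, len(X), self.chunksize)]
        return sp.vstack(parts)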
X = np.array(["numpy", "scipy", "sklearn"]) vectorizer = TfidfVectorizer(dtype=vectorizer_dtype) warning_msg_match = "'dtype' should be used." warning_cls = UserWarning expected_warning_cls = warning_cls if warning_expected else None with pytest.warns(expected_warning_cls, match=warning_msg_match) as record: X_idf = vectorizer.fit_transform(X) if expected_warning_cls is None: relevant_warnings = [w for w in record if isinstance(w, warning_cls)] assert len(relevant_warnings) == 0 assert X_idf.dtype == output_dtype @pytest.mark.parametrize("vec", [ HashingVectorizer(ngram_range=(2, 1)), CountVectorizer(ngram_range=(2, 1)), TfidfVectorizer(ngram_range=(2, 1)) ]) def test_vectorizers_invalid_ngram_range(vec): # vectorizers could be initialized with invalid ngram range # test for raising error message invalid_range = vec.ngram_range message = ("Invalid value for ngram_range=%s " "lower boundary larger than the upper boundary." % str(invalid_range)) if isinstance(vec, HashingVectorizer): pytest.xfail(reason='HashingVectorizer not supported on PyPy') assert_raise_message(ValueError, message, vec.fit, ["good news everyone"]) assert_raise_message(ValueError, message, vec.fit_transform,
next(stream_docs(path='./movie_data.csv'))

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
    )),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Pipeline using the hashing vectorizer (the original had unbalanced
# parentheses around the final 'clf' step, fixed here)
p2 = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', HashingVectorizer(
                    token_pattern=TOKENS_ALPHANUMERIC,
                    non_negative=True, norm=None, binary=False,
                    ngram_range=(1, 2))),
                ('dim_red', SelectKBest(chi2, chi_k))
            ]))
        ]
    )),
    ('int', SparseInteractions(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', XGBClassifier())
])
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer,
                                             HashingVectorizer)

corpus = [
    "Preach lol! :) RT @mention: #alliwantis this type of weather all the time.. I live for beautiful days like this! #minneapolis",
    "@mention good morning sunshine",
    "rhode island",
    "RT @mention: I absolutely love thunderstorms!",
    "@mention right this weather is something else",
    "TOP CHOICE --> {link} - Today is awesome!!! Free comic books, lunch with my mama, sunshine & DJ'n ... (via @mention)",
    "CCAk Trail Update: Archangel Road, Mat-Su - 8:00 PM, Thu May 05, 2011: Snow column beginning to break up especia... {link}"
]
counts = [[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0], [3, 2, 0], [3, 0, 2]]

# count vectorizer
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
X.toarray()

# tfidf transformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(counts)
tfidf.toarray()

# combination: tfidf vectorizer
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
X.toarray()

# hasher: saves time and space
hv = HashingVectorizer()
hv.transform(corpus)
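# HashingVectorizer is stateless: two independent instances produce
# identical output for the same input, so no fitted vocabulary ever needs
# to be shared or pickled (illustrative check, not part of the original):
hv2 = HashingVectorizer()
assert (hv.transform(corpus) != hv2.transform(corpus)).nnz == 0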
#!/usr/bin/python
# coding:utf-8
import sys
from sklearn.svm import LinearSVC
# from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer

if sys.version_info[0] >= 3:
    raw_input = input

transformer = HashingVectorizer(stop_words='english')

_train = []
train_label = []
f = open('trainingdata.txt')
for i in range(int(f.readline())):
    s = f.readline().rstrip()
    idx = s.find(' ')
    _train.append(s[idx + 1:])
    train_label.append(int(s[:idx]))
f.close()

train = transformer.fit_transform(_train)
svm = LinearSVC()
svm.fit(train, train_label)

_test = []
for i in range(int(raw_input())):
    s = raw_input().rstrip()
    _test.append(s)
test = transformer.transform(_test)

test_label = svm.predict(test)
for e in test_label:
    print(e)
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [float(l.split()[1]) / 4205907
                for l in open('keywordsAll.txt').readlines()]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'
    if testopt == 'c':
        cls = SGDClassifier(loss='hinge', learning_rate="constant",
                            alpha=1e-6, eta0=1e-2, penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = ('key_' + str(keyn) + '_SGDtune_' + opt + '_partialfit_'
                  + testopt + '.txt')
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv), norm=normOpt(nv),
                                   non_negative=str2bool(nneg))
    X_test = vectorizer.transform(body_test)
    #print 'test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg

    if n0 >= 2:
        cls = joblib.load(pklName)

    for n in xrange(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) / nPart + 1
        for i in xrange(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            shuffledRange = range(n2 - n1)
            for n_iter in xrange(5):
                X_train, y_train = shuffle(X_train, y_train)
                cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write("%3d %8d %.4f %.3f %.3f %.3f %.3f %5d %5d\n"
                          % (n, ntrain, accu, f1t, f1, p, r, sum(y_pred),
                             tot_pos))
            _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
import re
import pickle
import os
from nltk.stem.porter import PorterStemmer  # needed by tokenizer_porter
from sklearn.feature_extraction.text import HashingVectorizer

cur_dir = os.path.dirname(__file__)
stop = pickle.load(
    open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower())
            + ' '.join(emoticons).replace('-', ''))
    return text

def tokenizer_porter(text):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in text.split()]
    return line

vect_optimized = HashingVectorizer(decode_error='ignore',
                                   norm=None,
                                   n_features=2**21,
                                   preprocessor=preprocessor,
                                   stop_words=stop,
                                   tokenizer=tokenizer_porter)
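# Illustrative use of vect_optimized with a previously pickled classifier
# (the classifier filename is an assumption; only the transform call is
# implied by the snippet above):
# clf = pickle.load(open(os.path.join(cur_dir, 'pkl_objects',
#                                     'classifier.pkl'), 'rb'))
X = vect_optimized.transform(['This movie was excellent'])
# clf.predict(X)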
print("%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb)) print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) print("%d categories" % len(categories)) print() # split a training set and a test set y_train, y_test = data_train.target, data_test.target print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features) X_train = vectorizer.transform(data_train.data) else: vectorizer = CountVectorizer(ngram_range=(2, 2), max_df=0.5, stop_words='english') X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print("Extracting features from the test data using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test.data)
sdf = spark.createDataFrame(pdf)
cols = [F.col(str(c)) for c in sdf.columns]

# apply predict UDFs and select prediction output
prediction_df = (sdf.withColumn("scores", predict_proba(*cols))
                    .withColumn("preds", predict(*cols))
                    .select("preds", "scores"))
prediction_df.show()

# single text feature
data = fetch_20newsgroups(shuffle=True, random_state=1,
                          remove=("headers", "footers", "quotes"))
X = data["data"][:100]
y = data["target"][:100]

model = Pipeline([
    ("vec", HashingVectorizer()),
    ("clf", LogisticRegression(solver="liblinear", multi_class="auto")),
])
model.fit(X, y)

# get UDFs with 'text' feature types
predict = get_prediction_udf(model, method="predict", feature_type="text")
predict_proba = get_prediction_udf(model, method="predict_proba",
                                   feature_type="text")

# create PySpark DataFrame from features
pdf = pd.DataFrame(X)
sdf = spark.createDataFrame(pdf)
cols = [F.col(str(c)) for c in sdf.columns]
clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(train_data_features[0:8301], topics_has_earn_word[0:8301])
print "CountVectorizer BoW encoding \n" \
      "80% train data, 20% test data \n" \
      "Using 50 trees in RandomForestClassifier \n" \
      "Execution time: ", datetime.now() - old_time
score = clf.score(train_data_features[8301:], topics_has_earn_word[8301:])
print "Score:", score * 100, "\n"

# ------------------ BoW using feature hashing -----------------
# from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py
vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                               n_features=1000)
X_train = vectorizer.transform(data_clean_train)

old_time = datetime.now()
clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train[0:8301], topics_has_earn_word[0:8301])
print "HashingVectorizer BoW encoding \n" \
      "80% train data, 20% test data \n" \
      "Using 50 trees in RandomForestClassifier \n" \
      "Execution time: ", datetime.now() - old_time
score = clf.score(X_train[8301:], topics_has_earn_word[8301:])
print "Score:", score * 100
print("%d documents - %0.3fMB (training set)" % (len(data_train.data), data_train_size_mb)) print("%d documents - %0.3fMB (test set)" % (len(data_test.data), data_test_size_mb)) print("%d categories" % len(target_names)) print() # split a training set and a test set y_train, y_test = data_train.target, data_test.target print("Extracting features from the training data using a sparse vectorizer") t0 = time() if opts.use_hashing: vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False, n_features=opts.n_features) X_train = vectorizer.transform(data_train.data) else: vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print("Extracting features from the test data using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test.data)
    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc


###############################################################################
# Main
# ----
#
# Create the vectorizer and limit the number of features to a reasonable
# maximum
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2**18,
                               alternate_sign=False)

# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the
# others. "acq" was chosen as it is more or less evenly distributed in the
# Reuters files. For other datasets, one should take care of creating a
# test set with a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(),
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

def func():
    # np.nan is not a valid document, so fit_transform raises ValueError
    hv = HashingVectorizer()
    hv.fit_transform(['hello world', np.nan, 'hello hello'])
"WordBag", ] data_sizes= [40000, 80000, 160000, 320000, 640000, 1280000] for task in tasks: for data_size in data_sizes: texts_chunk = texts[:data_size] print("Task:", task, "Data size:", data_size) for backend in backends: batcher = Batcher(procs=16, minibatch_size=5000, backend=backend[0], backend_handle=backend[1]) #try: with timer("Completed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]"), warnings.catch_warnings(): warnings.simplefilter("ignore") if task=="ApplyBatch": hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 25, preprocessor=normalize_text, ngram_range=(1, 2), norm='l2') t= ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk) print(t.shape, t.data[:5]) if task=="WordBag": wb = WordBatch(normalize_text=normalize_text, dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0), tokenizer= Tokenizer(spellcor_count=2, spellcor_dist=2, stemmer= stemmer), extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0), batcher= batcher, verbose= 0) t = wb.fit_transform(texts_chunk) print(t.shape, t.data[:5]) # except: # print("Failed ["+task+","+str(len(texts_chunk))+","+backend[0]+"]") print("")