def get_hash_vectorizer(fname, n=20):
    # Read the cleaned text column and fit a hashing vectorizer on it.
    dfx = pd.read_csv(fname)
    sentence_list = [sent for sent in dfx['clean_text'].values]
    hasher = HashingVectorizer(n_features=n)
    hasher.fit(sentence_list)
    return hasher
def vectorize(self, X_text):
    # X_text is a DataFrame with 'title', 'author', and 'text' columns.
    news_df = X_text
    hash_text = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_title = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_author = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)

    X_text = news_df['text']
    text_vector = hash_text.fit_transform(X_text.values.astype('U'))
    self.text_vector = text_vector
    print(text_vector[:1])

    X_title_text = news_df['title']
    title_vector = hash_title.fit_transform(X_title_text.values.astype('U'))
    self.title_vector = title_vector

    X_author = news_df['author']
    author_vector = hash_author.fit_transform(X_author.values.astype('U'))
    self.author_vector = author_vector
    return author_vector
def vectorize(self, X_text):
    # X_text is an array with columns: title, author, text (in that order).
    news = X_text
    hash_text = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_title = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_author = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)

    X_text = news[:, 2]
    print('X_text')
    print(X_text[:5])
    text_vector = hash_text.fit_transform(X_text)
    self.text_vector = text_vector
    print(text_vector[:1])

    X_title_text = news[:, 0]
    title_vector = hash_title.fit_transform(X_title_text)
    self.title_vector = title_vector

    X_author = news[:, 1]
    author_vector = hash_author.fit_transform(X_author)
    self.author_vector = author_vector
    return author_vector
from scipy import sparse


class FeatureExtractor(object):

    def __init__(self, count=True, hashing=False):
        self.count = count
        self.hashing = hashing
        self.was_fit = False

    def fit(self, X_text):
        if self.count:
            self.cv = CountVectorizer()
            self.cv.fit(X_text)
        if self.hashing:
            self.hv = HashingVectorizer(ngram_range=(1, 2), norm=None,
                                        alternate_sign=False, binary=True)
            self.hv.fit(X_text)
        self.was_fit = True
        return

    def transform(self, X_text):
        assert self.was_fit
        if self.count:
            X_count = self.cv.transform(X_text)
            X = X_count
        if self.hashing:
            X_hashing = self.hv.transform(X_text)
            X = X_hashing
        if self.hashing and self.count:
            # Both vectorizers return sparse matrices, so stack them with
            # scipy.sparse.hstack (np.hstack does not work on sparse matrices).
            X = sparse.hstack([X_count, X_hashing])
        return X
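# A minimal usage sketch for FeatureExtractor above; the toy corpus is made up
# for illustration and is not part of the original project.
corpus = ["the cat sat on the mat", "a dog barked at the cat"]

fe = FeatureExtractor(count=True, hashing=True)
fe.fit(corpus)
X = fe.transform(corpus)   # sparse matrix: count columns followed by hashed columns
print(X.shape)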
def train(cls, trn_corpus, config=None, dtype=np.float32):
    """Train on a corpus.

    Args:
        trn_corpus (list): Training corpus in the form of a list of strings.
        config (dict, optional): Keyword arguments to pass to sklearn's HashingVectorizer.
        dtype (type, optional): Data type. Default is `numpy.float32`.

    Returns:
        Hashing: Trained vectorizer.

    Raises:
        Exception: If `config` contains keyword arguments that the hashing
            vectorizer does not accept.
    """
    defaults = {
        "encoding": "utf-8",
        "strip_accents": "unicode",
        "stop_words": None,
        "ngram_range": (1, 2),
        "lowercase": True,
        "norm": "l2",
        "dtype": dtype,
        "n_features": 1048576,  # default number in HashingVectorizer
    }
    config = config if config is not None else {}
    try:
        model = HashingVectorizer(**{**defaults, **config})
    except TypeError:
        raise Exception(
            f"vectorizer config {config} contains unexpected keyword arguments for HashingVectorizer"
        )
    model.fit(trn_corpus)
    return cls(model)
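# A hedged usage sketch for the train() classmethod above, assuming it is a
# @classmethod on a small wrapper class (called `Hashing` here, per the
# docstring) whose __init__ stores the fitted HashingVectorizer. The corpus
# and config values below are invented for illustration.
trn_corpus = ["hashing vectorizers need no vocabulary",
              "they map tokens to columns with a hash function"]
vectorizer = Hashing.train(trn_corpus, config={"ngram_range": (1, 1), "n_features": 2**18})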
def train_hash_vectorizer(train, test, question):
    hash_vectorizer = HashingVectorizer(ngram_range=(1, 1))
    tfidf_txt = pd.Series(train[question + '1'].tolist() +
                          train[question + '2'].tolist() +
                          test[question + '1'].tolist() +
                          test[question + '2'].tolist()).astype(str)
    hash_vectorizer.fit(tfidf_txt)
    return hash_vectorizer
def BOW():
    from sklearn.feature_extraction.text import HashingVectorizer
    # Bag-of-words over character/word bigrams; relies on module-level
    # training_tweets, val_tweets and test_tweets.
    vectorizer = HashingVectorizer(ngram_range=(2, 2))
    vectorizer.fit(training_tweets)
    train_sequences = vectorizer.transform(training_tweets)
    val_sequences = vectorizer.transform(val_tweets)
    test_sequences = vectorizer.transform(test_tweets)
    return train_sequences, val_sequences, test_sequences
def test_hashing_vectorizer():
    for norm in ["l1", "l2", None]:
        vec = HashingVectorizer(n_features=2**8, norm=norm)
        vec.fit(X)
        vec_ = convert_estimator(vec)
        X_t = vec.transform(X)
        X_t_ = vec_.transform(X)
        assert np.allclose(X_t.toarray(), X_t_.todense())
def __wordhash_features(self, data, vect=None, num_features=3000):
    '''extracts word ngram features from the provided data'''
    if vect is None:
        vect = HashingVectorizer(n_features=num_features, analyzer="word",
                                 stop_words='english', strip_accents='unicode',
                                 ngram_range=(1, 4))
        vect.fit(data)
    features = vect.transform(data)
    return features, vect
def tfidf_process_ci(data, train_data, test_data):
    y = train_data['Score']
    tf1 = TfidfVectorizer(ngram_range=(1, 6), token_pattern=r'\w+', analyzer='word')
    tf1.fit(data['cutted_Dis'])
    data1 = tf1.transform(train_data['cutted_Dis'])
    test1 = tf1.transform(test_data['cutted_Dis'])
    print(data1.shape)
    tf2 = HashingVectorizer(ngram_range=(1, 2), lowercase=False)
    tf2.fit(data['cutted_Dis'])
    data2 = tf2.transform(train_data['cutted_Dis'])
    test2 = tf2.transform(test_data['cutted_Dis'])
    print(data2.shape)
    train = hstack((data1, data2)).tocsr()
    test = hstack((test1, test2)).tocsr()
    return train, test, y
def hash_feature(data, feature, max_f):
    vectorizer = HashingVectorizer(n_features=max_f, stop_words='english',
                                   alternate_sign=False, norm='l1', dtype=np.float32)
    # testing make_pipeline
    # vectorizer = make_pipeline(hasher, TfidfTransformer())
    if isinstance(data, np.ndarray):
        vectorizer.fit(data[:, feature])
        features_vec = vectorizer.transform(data[:, feature])
    else:
        vectorizer.fit(data[feature])
        features_vec = vectorizer.transform(data[feature])
    return vectorizer, features_vec
def fit_vectorizer(data, embedding_size, max_len, PAD):
    """Fit a character-level hashing vectorizer on the data."""
    vectorizer = HashingVectorizer(n_features=embedding_size, analyzer='char', lowercase=False)
    words = [PAD]
    for sentences in data:
        t = list(sentences)
        if len(t) > max_len:
            t = t[:max_len]
        pad_size = max_len - len(t)
        if pad_size > 0:
            t = t + [PAD] * pad_size
        words += t
    vectorizer.fit(words)
    return vectorizer
def build_vectors():
    text = []
    print("Cleaning data")
    for idx, row in df.iterrows():
        cleaned = pipeline(row['title'] + ' ' + row['abstract'])
        text.append(cleaned)
    print("Building vectors")
    hv = HashingVectorizer(n_features=2**10)
    hv.fit(text)
    X = hv.transform(text)
    print("Saving")
    save_npz(VECTORS_F, X)
    with open(MODEL_F, 'wb+') as f:
        pickle.dump(hv, f)
    return X, hv
def hash_vector(features, ngram=(1, 1), n_features=1048576, **kwargs):
    # `non_negative=True` was removed from scikit-learn; `alternate_sign=False`
    # is the non-negative equivalent.
    vectorizer = HashingVectorizer(analyzer='word', ngram_range=ngram,
                                   stop_words='english', norm='l2',
                                   alternate_sign=False, lowercase=True,
                                   n_features=n_features)
    fitted = vectorizer.fit(features)
    return fitted.transform(features), fitted
def tfidf_process_zi(data, train_data, test_data):
    data = cut_zi(data)
    train_data = cut_zi(train_data)
    test_data = cut_zi(test_data)
    y = train_data['Score']
    tf1 = TfidfVectorizer(ngram_range=(1, 6), analyzer='char')
    tf1.fit(data['cut_zi'])
    data1 = tf1.transform(train_data['cut_zi'])
    test1 = tf1.transform(test_data['cut_zi'])
    print(data1.shape)
    tf2 = HashingVectorizer(ngram_range=(1, 2), lowercase=False)
    tf2.fit(data['cut_zi'])
    data2 = tf2.transform(train_data['cut_zi'])
    test2 = tf2.transform(test_data['cut_zi'])
    print(data2.shape)
    train = hstack((data1, data2)).tocsr()
    test = hstack((test1, test2)).tocsr()
    return train, test, y
def setup():
    print("Configuring the logger")
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    print("Connecting to Mongo")
    client = MongoClient(config.DB_HOST, config.DB_PORT)
    db = client[config.DB_NAME]
    print("Loading the models")
    doc2vec = models.Doc2Vec.load("../models/tweet_model_doc2vec_v2_300_new.bin")
    twitterCollection = db["tweet_leisure"]
    dictionaryCollection = db["dictionary"]
    vectorizer = HashingVectorizer(stop_words='english', ngram_range=(1, 1))
    documents = list(twitterCollection.find())
    documents = list(map(lambda x: ' '.join(x["tokens"]), documents))
    vectorizer.fit(documents)
    return dictionaryCollection, twitterCollection, doc2vec, vectorizer
def CreateRpeFeature(self, look, test=False, verbose=False):
    if not test:
        vectorizer = HashingVectorizer(n_features=2**8, ngram_range=(1, 2))
        vectorizer.fit(self.fulldata_words['rpe'].values)
        self.rpe_vectorizer = vectorizer

    def create_rpe_features(g):
        rpe = g[((g["word_num"] - g["target_word_num"]).abs() <= look) &
                ~(g["word_num"] == g["target_word_num"])]['rpe'].values
        return " ".join(rpe)

    rpe_sentences = self.fulldata_words.groupby("sentence_num").apply(create_rpe_features)
    if test:
        # At test time, reuse the vectorizer fitted on the training data.
        return rpe_sentences.apply(lambda x: pd.Series(
            data=self.rpe_vectorizer.transform([x]).toarray()[0],
            index=[f"rpe_hash_{k}" for k in range(self.rpe_vectorizer.n_features)]))
    else:
        return rpe_sentences.apply(lambda x: pd.Series(
            data=vectorizer.transform([x]).toarray()[0],
            index=[f"rpe_hash_{k}" for k in range(vectorizer.n_features)]))
def train_model(X_train):
    # X_train is an array with columns: title, author, text (in that order).
    news_df = X_train
    hash_author = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_title = HashingVectorizer(ngram_range=(3, 7), analyzer="char", alternate_sign=False)
    hash_text = HashingVectorizer(ngram_range=(3, 7), analyzer="word", alternate_sign=False)

    X_text = news_df[:, 2]
    X = hash_text.fit_transform(X_text)
    X_title_text = news_df[:, 0]
    X2 = hash_title.fit_transform(X_title_text)
    X_author = news_df[:, 1]
    X3 = hash_author.fit_transform(X_author)
    print('vectorized')

    pickle_path1 = os.path.join("resources", "X_text_matrix.pkl")
    pickle_path2 = os.path.join("resources", "X_title_matrix.pkl")
    pickle_path3 = os.path.join("resources", "X_author_matrix.pkl")
    with open(pickle_path1, "wb") as output_file:
        pickle.dump(X, output_file)
    with open(pickle_path2, "wb") as output_file2:
        pickle.dump(X2, output_file2)
    with open(pickle_path3, "wb") as output_file3:
        pickle.dump(X3, output_file3)
    return
def save_fit_result(texts_ql_qr):
    marisa_count1 = MarisaCountVectorizer()
    marisa_tfidf1 = MarisaTfidfVectorizer()
    hashing1_18 = HashingVectorizer(n_features=2**20)
    hashing1_20 = HashingVectorizer(n_features=2**24)
    hashing2_18 = HashingVectorizer(ngram_range=(1, 2), n_features=2**20)
    hashing2_20 = HashingVectorizer(ngram_range=(1, 2), n_features=2**24)

    ql_qr_tfidf1 = marisa_tfidf1.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_tfidf1.pkl', 'wb') as a:
        pickle.dump(ql_qr_tfidf1, a)
    print('ql_qr_tfidf1 is ok')

    ql_qr_count1 = marisa_count1.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_count1.pkl', 'wb') as a:
        pickle.dump(ql_qr_count1, a)
    print('ql_qr_count1 is ok')

    ql_qr_hash1_18 = hashing1_18.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash1_18.pkl', 'wb') as a:
        pickle.dump(ql_qr_hash1_18, a)
    print('ql_qr_hash1_18 is ok')

    ql_qr_hash2_18 = hashing2_18.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash2_18.pkl', 'wb') as a:
        pickle.dump(ql_qr_hash2_18, a)
    print('ql_qr_hash2_18 is ok')

    ql_qr_hash1_20 = hashing1_20.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash1_20.pkl', 'wb') as a:
        pickle.dump(ql_qr_hash1_20, a)
    print('ql_qr_hash1_20 is ok')

    ql_qr_hash2_20 = hashing2_20.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash2_20.pkl', 'wb') as a:
        pickle.dump(ql_qr_hash2_20, a)
    print('ql_qr_hash2_20 is ok')
def Hashvect(train_int, test_int=None, Ngram_min=1, Ngram_max=1):
    import pandas as pd
    from sklearn.feature_extraction.text import HashingVectorizer

    def toktotxt(txt_int):
        if isinstance(txt_int[0], list):
            text = txt_int.apply(lambda x: " ".join(str(i) for i in x))
        else:
            text = txt_int
        return text

    train_txt = toktotxt(train_int)
    vectorizer = HashingVectorizer(ngram_range=(Ngram_min, Ngram_max))
    vectorizer.fit(train_txt)
    X = vectorizer.transform(train_txt)
    train = X.toarray()
    if test_int is None:
        out = train
    else:
        test_txt = toktotxt(test_int)
        Y = vectorizer.transform(test_txt)
        test = Y.toarray()
        out = train, test
    return out
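# A minimal usage sketch for Hashvect above, assuming pandas Series of token
# lists as input; the toy tokens are invented for illustration.
import pandas as pd

train_tokens = pd.Series([["the", "cat", "sat"], ["a", "dog", "ran"]])
test_tokens = pd.Series([["the", "dog", "sat"]])
train_mat, test_mat = Hashvect(train_tokens, test_tokens, Ngram_min=1, Ngram_max=2)
print(train_mat.shape, test_mat.shape)  # dense arrays with 2**20 hashed columns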
def vectorize_data(data, n_features=50000):
    """
    :param data: a list of tweets
    :param n_features: int
    :return: a sparse matrix of hashed features
    """
    # `non_negative=True` was removed from scikit-learn; `alternate_sign=False`
    # is the non-negative equivalent.
    vectorizer = HashingVectorizer(alternate_sign=False, binary=False, norm=None,
                                   ngram_range=(1, 2), analyzer='word',
                                   n_features=n_features)
    data_vec = vectorizer.fit_transform(data)
    return data_vec
plotly.offline.init_notebook_mode(connected=True)

import itertools
import matplotlib.pyplot as plt

labels = labels()
review_tokens = all_reviews()

# vectorizers for organizing training/testing data
count_vect = CountVectorizer()
tf_vect = TfidfVectorizer()
hash_vect = HashingVectorizer()

# tokens must be a list of full reviews
count_vect.fit(review_tokens)
tf_vect.fit(review_tokens)
hash_vect.fit(review_tokens)

# split data into training and test sets with train_test_split; shuffle=True because
# the first half of 'review_tokens' holds positive reviews and the second half negative
x_train, x_test, y_train, y_test = train_test_split(review_tokens, labels,
                                                    test_size=.5, random_state=1234,
                                                    shuffle=True)

# create SVM classifiers and train them
count_lsvm = LinearSVC()
tf_lsvm = LinearSVC()
hash_lsvm = LinearSVC()
def parse_wikirfa_edges_as_search(n_features=100):
    '''Parse wiki-rfa for an edge sentiment analysis experiment.'''
    from sklearn.feature_extraction.text import HashingVectorizer

    path = "%s/data/wikirfa/" % (current_dir,)
    n_nodes = 10926
    n_edges = 176963       # number of +/- edges
    n_edges_read = 189004  # includes neutral edges

    # username -> index map constructed on the fly
    usermap = {}

    A = np.zeros((n_features + 1, n_nodes, n_nodes), dtype='float32')
    B = np.zeros((n_edges, n_nodes), dtype='float32')
    X = np.ones((n_nodes, 1), dtype='float32')
    Y = np.zeros((n_edges, 2), dtype='int32')

    # build comment vectorizer
    with open(path + 'all_comments.txt', 'r') as f:
        vectorizer = HashingVectorizer(input='content', n_features=n_features)
        vectorizer.fit(f)

    u = 0
    v = 0
    resmap = {-1: 0, 1: 1}
    votemap = {-1: 0, 1: 1}
    with open(path + 'wiki-RfA.txt', 'r') as f:
        for i in range(n_edges_read):
            # Read in the entry
            # SRC:Guettarda
            tail = f.readline().strip()[4:]
            # TGT:Lord Roem
            head = f.readline().strip()[4:]
            # VOT:1
            vote = int(f.readline().strip()[4:])
            # RES:1
            res = int(f.readline().strip()[4:])
            # YEA:2013
            year = int(f.readline().strip()[4:])
            # DAT:19:53, 25 January 2013
            date = f.readline().strip()[4:]
            # TXT:'''Support''' per [[WP:DEAL]]: clueful, and unlikely to break Wikipedia.
            txt = f.readline().strip()[4:]
            # kill blank line
            f.readline()

            # Process the entry
            # index users
            if tail not in usermap:
                usermap[tail] = u
                u += 1
            if head not in usermap:
                usermap[head] = u
                u += 1
            if vote != 0:
                # add vote edge
                A[0, usermap[tail], usermap[head]] = vote
                # add to incidence matrix
                B[v, usermap[tail]] = 1
                B[v, usermap[head]] = 1
                # vectorize text using the hashing trick
                # try:
                #     commentcount[(tail, head)] += 1
                # except KeyError:
                #     commentcount[(tail, head)] = 1
                # features = np.zeros(n_features)
                # for token in txt.strip().split(' '):
                #     features[hash(token) % n_features] += 1
                # A[1:, usermap[tail], usermap[head]] = features
                A[1:, usermap[tail], usermap[head]] = np.asarray(
                    vectorizer.transform([txt]).todense())[0]
                X[usermap[head], 0] = resmap[res]  # treat the result as the node class
                Y[v, votemap[vote]] = 1
                v += 1

    # normalize features
    # for edge in commentcount:
    #     A[1:, edge[0], edge[1]] /= commentcount[edge]

    print(u)
    print(v)
    assert len(usermap) == n_nodes
    assert u == n_nodes
    assert v == n_edges
    return A, B, X, Y
enc = OneHotEncoder()
for feature in one_hot_feature:
    enc.fit(data[feature].values.reshape(-1, 1))
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')
print(train_x.shape)
print(test_x.shape)

cv = HashingVectorizer(n_features=1000)
# cv = CountVectorizer()
for feature in vector_feature:
    cv.fit(data[feature])
    train_a = cv.transform(train[feature])
    test_a = cv.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('cv prepared !')
print(train_x.shape)
print(test_x.shape)
del train, test


def LGB_test(train_x, train_y, test_x, test_y):
    print("LGB test")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
def clean_text(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    string = ' '.join(filtered_sentence)
    return string


# In[10]:

# Function to predict if an input string is likely to be in a top journal
# note: copy/pasted from Econ_machineLearn.ipynb
hash_vectorizer = HashingVectorizer(analyzer='word', ngram_range=(1, 2))
hash_vectorizer.fit(X_train)


def model_predict(s):
    string = []
    string.append(s)
    vectorized = hash_vectorizer.transform(string)
    probab = round(max(clf2.predict_proba(vectorized)[0]) * 100, 2)
    prediction = clf2.predict(vectorized)[0]
    if prediction == 1:
        result = "Predicted to be in the top 20 Economics journals"
    else:
        result = "Predicted to NOT be in the top 20 Economics journals"
    return result + " with a probability of " + str(probab) + "%."
                   if word not in set(stopwords.words('english')))
    return text


df['Sentence'] = df['Sentence'].apply(clean_text)
X_core = df['Sentence'].values
midway = int(X_core.shape[0] / 2)

######################
from sklearn.feature_extraction.text import HashingVectorizer

hashing_vect = HashingVectorizer(n_features=15000)
fitted_vect = hashing_vect.fit(X_core[:midway])
with open('hashing_fitted_vect.pickle', 'wb') as fin:
    pickle.dump(fitted_vect, fin)

X_hash = fitted_vect.transform(X_core[:midway]).toarray()
X_train, X_test, y_train, y_test = train_test_split(
    X_hash, pd.get_dummies(df['Risk_Factor'][:midway]).values,
    test_size=0.2, random_state=42)

RandomProjection = random_projection.GaussianRandomProjection(n_components=4000)
X_train = RandomProjection.fit_transform(X_train)
X_test = RandomProjection.transform(X_test)
N_FEATURES = [5000, 10000]
stopwords = nltk.corpus.stopwords
STOPWORDS = stopwords.words('english') + list(string.punctuation) + [
    '``', '...', '--', ',', ':', 'br', '\'s', '\'', 'n\'t', '\'\''
]
DATA_FOLDER = './processed/'
CORPUS = pd.read_pickle(DATA_FOLDER + 'pd.DF.train_both.pickle')['text']

for n_gram in N_GRAM:
    for n_features in N_FEATURES:
        # create the hashing vectorizer
        hv = HashingVectorizer(stop_words=STOPWORDS, n_features=n_features,
                               ngram_range=n_gram)
        hv.fit(CORPUS)
        # create a pipeline for tfidf calculations
        hv2 = HashingVectorizer(stop_words=STOPWORDS, n_features=n_features,
                                norm=None, ngram_range=n_gram)
        tf = TfidfTransformer()
        tfidf = Pipeline([("hash", hv2), ("tf", tf)])
        tfidf.fit(CORPUS)
        # transform all the data we have
        for dataset in ['train', 'test']:
            neg_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset +
                                        '_neg.pickle')['text']
stop_words = nltk.corpus.stopwords.words('english') + list(string.punctuation)
vectorizer = HashingVectorizer(norm='l1')  # this worked best
table = str.maketrans('', '', string.punctuation)

training_words = get_text_features(train_data)
test_words = get_text_features(test_data)
all_words = training_words + test_words

train_text = list()
test_text = list()
for item in training_words:
    text = " ".join(item)
    train_text.append(text)
for item in test_words:
    text = " ".join(item)
    test_text.append(text)

vectorizer.fit(train_text + test_text)
X_train_text_features = vectorizer.transform(train_text)
X_test_text_features = vectorizer.transform(test_text)

print("Concatenating all features.....")
X_train_final = sp.hstack([X_train_categorical, X_train_text_features.astype(float)])
X_test_final = sp.hstack([X_test_categorical, X_test_text_features.astype(float)])
# print(X_train_final.shape)
def parse_wikirfa(n_features=100):
    '''Parse wiki-rfa for an edge sentiment analysis experiment.'''
    # from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer

    path = "%s/data/wikirfa/" % (current_dir,)
    n_nodes = 10926
    n_edges = 176963       # number of +/- edges
    n_edges_read = 189004  # includes neutral edges

    # username -> index map constructed on the fly
    usermap = {}

    A = np.zeros((n_nodes, n_nodes), dtype='float32')
    B = np.zeros((n_edges, n_nodes), dtype='float32')
    X_N = np.ones((n_nodes, 1), dtype='float32')
    X_E = np.ones((n_edges, n_features), dtype='float32')
    Y = np.zeros((n_edges, 2), dtype='int32')

    # build comment vectorizer
    with open(path + 'all_comments.txt', 'r') as f:
        # vectorizer = CountVectorizer(input='content', max_features=n_features)
        vectorizer = HashingVectorizer(input='content', n_features=n_features)
        vectorizer.fit(f)

    u = 0
    v = 0
    resmap = {-1: 0, 1: 1}
    votemap = {-1: 0, 1: 1}
    with open(path + 'wiki-RfA.txt', 'r') as f:
        for i in range(n_edges_read):
            # Read in the entry
            # SRC:Guettarda
            tail = f.readline().strip()[4:]
            # TGT:Lord Roem
            head = f.readline().strip()[4:]
            # VOT:1
            vote = int(f.readline().strip()[4:])
            # RES:1
            res = int(f.readline().strip()[4:])
            # YEA:2013
            year = int(f.readline().strip()[4:])
            # DAT:19:53, 25 January 2013
            date = f.readline().strip()[4:]
            # TXT:'''Support''' per [[WP:DEAL]]: clueful, and unlikely to break Wikipedia.
            txt = f.readline().strip()[4:]
            # kill blank line
            f.readline()

            # Process the entry
            # index users
            if tail not in usermap:
                usermap[tail] = u
                u += 1
            if head not in usermap:
                usermap[head] = u
                u += 1
            if vote != 0:
                # add vote edge
                A[usermap[tail], usermap[head]] = vote
                A[usermap[head], usermap[tail]] = vote
                # add to incidence matrix
                B[v, usermap[tail]] = 1
                B[v, usermap[head]] = 1
                X_N[usermap[head], 0] = resmap[res]
                X_E[v, :] = np.asarray(vectorizer.transform([txt]).todense())[0]
                Y[v, votemap[vote]] = 1
                v += 1

    print(u)
    print(v)
    assert len(usermap) == n_nodes
    assert u == n_nodes
    assert v == n_edges
    return A, B, X_N, X_E, Y
    # efficient in case of large datasets
    vectorizer = HashingVectorizer(stop_words='english')
    # With HashingVectorizer we don't need to fit the data;
    # transform alone is enough.
    X_train = vectorizer.transform(data_train.data)
    X_test = vectorizer.transform(data_test.data)
elif feature_extractor_type == "count":
    # The other vectorizer we can use is CountVectorizer with binary=True.
    # CountVectorizer must be fit over both training and test data, since it
    # needs the complete vocabulary to build the matrix.
    vectorizer = CountVectorizer(stop_words='english', binary=True)
    # First fit the data
    vectorizer.fit(data_train.data + data_test.data)
    # Then transform it
    X_train = vectorizer.transform(data_train.data)
    X_test = vectorizer.transform(data_test.data)

# alpha is the additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
clf = BernoulliNB(alpha=.01)
# Training the classifier
clf.fit(X_train, y_train)
# Predicting results
y_predicted = clf.predict(X_test)
score = metrics.accuracy_score(y_test, y_predicted)
del artist

train.song_hotttnesss[train.song_hotttnesss > train.song_hotttnesss.mean()] = 1
train.song_hotttnesss[train.song_hotttnesss < train.song_hotttnesss.mean()] = 0
train.year = (train.year // 10) * 10
test.year = (test.year // 10) * 10
CategoricalFeatures = train[['artist_id', 'title', 'audio_md5']]

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

vectorizer = HashingVectorizer(n_features=750)
# vectorizer = TfidfVectorizer(min_df=0.0002)
TfidfVectorizerObject = vectorizer.fit(pd.concat([train.title, test.title]))
CountVectorizerTrainData = TfidfVectorizerObject.transform(train["title"])
CountVectorizerTestData = TfidfVectorizerObject.transform(test["title"])

DropFeatures = [
    'song_id', 'artist_id', 'title', 'audio_md5', 'analysis_sample_rate',
    'key_confidence', 'audio_md5', 'year', 'end_of_fade_in', 'duration',
    'time_signature_confidence', 'artist_latitude', 'artist_longitude'
]
trainSongId = train[['song_id']]
train = train.drop(DropFeatures, axis=1)
song_id = test['song_id']
test = test.drop(DropFeatures, axis=1)
train = pd.concat([train, pd.DataFrame(CountVectorizerTrainData.toarray())], axis=1)
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# initialize TFIDF vectorizer and Hashing Vectorizer
tfidf = TfidfVectorizer(token_pattern=TOKENS_ALPHANUMERIC, ngram_range=(1, 2),
                        max_df=1.0, min_df=10, stop_words='english')
hsv = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC, stop_words='english')

# fit tfidf and hashing vectorizer to train data
print("Feature Extraction")
tfidf.fit(x_)
hsv.fit(x_)

# transform
X_tfidf = tfidf.transform(x_)
X_test_tfidf = tfidf.transform(test_x_)
X_hsv = hsv.transform(x_)
X_test_hsv = hsv.transform(test_x_)

# combine
X = sparse.hstack((X_hsv, X_tfidf))
X_test = sparse.hstack((X_test_hsv, X_test_tfidf))

######################################################
# Using SGDClassifier to build 27 models. Each class
# (9 total) will have 3 models - varying some of the
# (notebook output) fitted CountVectorizer with default parameters:
# CountVectorizer(dtype=numpy.int64, encoding='utf-8', input='content',
#                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
#                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
#                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
#                 tokenizer=None, vocabulary=None)

# Creates a df that lists numbers of uniques and the unique 'string'
df = pd.DataFrame(cvec.transform([spam]).todense(),
                  columns=cvec.get_feature_names())
df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Hashing Vectorizer, more for big data
from sklearn.feature_extraction.text import HashingVectorizer

hvec = HashingVectorizer()
hvec.fit([spam])
df = pd.DataFrame(hvec.transform([spam]).todense())
df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Breaks up sentences and puts them into an array
from nltk.tokenize import PunktSentenceTokenizer

easy_text = "I went to the zoo today. What do you think of that? I bet you hate it! Or maybe you don't"
sent_detector = PunktSentenceTokenizer()
sent_detector.sentences_from_text(easy_text)
"""
Out[6]:
['I went to the zoo today.',
 'What do you think of that?',
 'I bet you hate it!',