def version2():
    # Data cleaning for the NLP model
    corpus = []
    english_stops = set(sw.words('english'))  # compute the stopword set once, not per review
    stemmer = ps()
    for i in range(0, 527383):
        # Keep only letters in each review, then lowercase, tokenize, remove stopwords, and stem
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = review.lower()
        review = review.split()
        review = [word for word in review if word not in english_stops]
        review = [stemmer.stem(word) for word in review]
        review = " ".join(review)
        corpus.append(review)
    labels = df.iloc[:, -1]
    # Split the raw text first; the original passed 100 positionally to train_test_split,
    # treated here as random_state
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)
    # Fit the vectorizer on the training split only, then reuse it for the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)  # prediction score
    cm(labels_test, predictions)   # confusion matrix
    return model
def version1():
    # Logistic Regression Model
    # The original passed 100 positionally to train_test_split; treated here as random_state
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)
    # Fit the vectorizer on the training split only, then reuse it for the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)
    model = lr().fit(features_train_vectorized, labels_train)  # Model creation for logistic regression
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)  # Generating prediction score
    cm(labels_test, predictions)
    return model
def news_iterator_raw(input_size, batchsize):
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    print("Loading 20 newsgroups dataset for 20 categories.")
    #print(categories)
    traindata = fetch_20newsgroups(subset='train',
                                   remove=('headers', 'footers', 'quotes'),
                                   categories=None)
    tfidf_vectorizer = cv(max_df=0.95, min_df=2,
                          max_features=input_size, stop_words='english')
    train = tfidf_vectorizer.fit_transform(traindata.data)  # bag of words
    vocabulary = tfidf_vectorizer.get_feature_names()  # feature names
    x = train.astype(np.float64).toarray()
    train_x = x / (np.sum(x, axis=1)[:, None] + 1e-10)
    train_y = traindata.target
    testdata = fetch_20newsgroups(subset='test', categories=None,
                                  remove=('headers', 'footers', 'quotes'))
    test_features = tfidf_vectorizer.transform(testdata.data)
    x = test_features.astype(np.float64).toarray()
    test_x = x / (np.sum(x, axis=1)[:, None] + 1e-10)
    test_y = testdata.target
    return (train_x, train_y, test_x, test_y, vocabulary)
def main():
    traindata = p.read_table('train.tsv')
    tr_title, tr_body, tr_url = convert_text(traindata)
    testdata = list(np.array(p.read_table('test.tsv'))[:, 2])
    y = np.array(p.read_table('train.tsv'))[:, -1]
    wordCount = cv(stop_words='english', encoding='latin-1')
    wordTFIDF = tfidf(stop_words='english', encoding='latin-1')
    corpus = tr_body
    bag = wordCount.fit_transform(corpus)
    tfidf_matrix = wordTFIDF.fit_transform(corpus)
    tfidf_matrix = tfidf_matrix.toarray()
    kmeans_soln.getDender(bag, tr_title)
    titles = np.array(tr_title)
    vocab = wordCount.get_feature_names()
    vocabTF = wordTFIDF.get_feature_names()
    # 'centers' is expected to come from a clustering step that is not shown in this snippet
    topWords(centers, vocab)
def train(self):
    """ Trains the model based on movies: title, genres, tag """
    for attribute in ['title', 'genres', 'actorName', 'directorName']:
        self.movies[attribute] = self.movies[attribute]
    self.movies['merged'] = self.movies.apply(self.merge, axis=1)
    count_vectorized = cv()
    cs = cosine_similarity(
        count_vectorized.fit_transform(self.movies['merged']))
    recommended_movies = list(
        enumerate(cs[self.get_movie_id(self.watched_movie)]))
    if recommended_movies:
        predicted = self.get_highest(recommended_movies)
        for i, row in self.movies.iterrows():
            if predicted[0] == i:
                print('\nSince you\'ve liked', self.watched_movie,
                      'We recommend: ', row['title'],
                      'genres:', row['genres'])
                print('Accuracy', predicted[1])
            if i == 999:
                print(self.watched_movie, 'movie\'s genre:', row['genres'])
                print()
    else:
        print('Something went wrong with the analysis')
def news_iterator(input_size, batchsize=100, alldata=True, label="y"):
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # load train data
    if alldata:
        print("Loading 20 newsgroups dataset for 20 categories.")
        traindata = fetch_20newsgroups(subset='train',
                                       remove=('headers', 'footers', 'quotes'),
                                       categories=None)
    else:
        print("Loading 20 newsgroups dataset for 2 categories.")
        traindata = fetch_20newsgroups(subset='train',
                                       remove=('headers', 'footers', 'quotes'),
                                       categories=categories)
    # preprocessing
    words = traindata.data
    train_words = []
    for i in range(0, len(traindata.data)):
        train_words.append(raw_to_words(traindata.data[i]))
    # train iterator
    vectorizer = cv(analyzer="word", max_features=input_size,
                    stop_words='english')
    train_features = vectorizer.fit_transform(train_words).toarray()
    vocabulary = vectorizer.get_feature_names()  # feature names
    x = train_features.astype(np.float64)
    X_normalized = x / (np.max(x, axis=1)[:, None] + 1e-10)
    #ss = pp.StandardScaler(with_mean=False).fit(x)
    #X_normalized = ss.transform(x)
    y = traindata.target
    if label == "x":
        train_dataiter = mx.io.NDArrayIter(data=X_normalized,
                                           label=X_normalized,
                                           batch_size=batchsize,
                                           shuffle=True)
    else:
        train_dataiter = mx.io.NDArrayIter(data=X_normalized,
                                           label=y,
                                           batch_size=batchsize,
                                           shuffle=True)
    # load test data
    if alldata:
        testdata = fetch_20newsgroups(subset='test', categories=None,
                                      remove=('headers', 'footers', 'quotes'))
    else:
        testdata = fetch_20newsgroups(subset='test', categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
    test_words = []
    for i in range(0, len(testdata.data)):
        test_words.append(raw_to_words(testdata.data[i]))
    # test iterator
    test_features = vectorizer.transform(test_words).toarray()
    x = test_features.astype(np.float64)
    #X_normalized = ss.transform(x)
    X_normalized = x / (np.max(x, axis=1)[:, None] + 1e-10)
    y = testdata.target
    if label == "y":
        val_dataiter = mx.io.NDArrayIter(data=X_normalized,
                                         label=y,
                                         batch_size=batchsize,
                                         shuffle=True)
    else:
        val_dataiter = mx.io.NDArrayIter(data=X_normalized,
                                         label=X_normalized,
                                         batch_size=batchsize,
                                         shuffle=True)
    return (train_dataiter, val_dataiter, vocabulary)
def transformer_array_vsm(tweet_list):
    # Build the term-document matrix under the vector space model (tf-idf weighted)
    corpus = []
    for tweet in tweet_list:
        corpus.append(' '.join(tweet.get_word_list()))
    vectorizer = cv()
    transformer = tt()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    array = tfidf.toarray()
    word_list = vectorizer.get_feature_names()
    return array, word_list
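# A minimal usage sketch (not from the original project) for transformer_array_vsm above.
# FakeTweet is a hypothetical stand-in for whatever tweet object exposes get_word_list().
class FakeTweet(object):
    def __init__(self, words):
        self.words = words

    def get_word_list(self):
        return self.words


example_tweets = [FakeTweet(['good', 'morning', 'world']),
                  FakeTweet(['good', 'night', 'moon'])]
array, word_list = transformer_array_vsm(example_tweets)
print(array.shape)  # (2, number of distinct terms)
print(word_list)    # the terms, in feature-column order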
def jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=1,
                 dictionary=""):
    # get stopwords
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    if dictionary == "":
        vectorizer = cv(max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    else:
        vocabulary = open(dictionary, 'r').read().split("\n")
        vectorizer = cv(vocabulary=vocabulary,
                        max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0], cut_all=False)))
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()
    wordssum = vect.sum(axis=0)
    index = range(len(voc))
    index = [
        index for (y, x, index) in sorted(zip(wordssum, voc, index),
                                          reverse=True) if x not in stopwords
    ]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    print time.time() - getdatatime
    return vect, voc, txt
def cossim(doc1, doc2):
    from sklearn.metrics.pairwise import cosine_similarity as cs
    from sklearn.feature_extraction.text import CountVectorizer as cv
    x = [doc1, doc2]
    vectorizer = cv().fit_transform(x)
    vectors = vectorizer.toarray()
    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)
    similarity_score = cs(a, b)
    return similarity_score
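# Quick usage sketch for cossim above; the two sentences are arbitrary examples.
doc_a = "the cat sat on the mat"
doc_b = "the cat sat on the hat"
score = cossim(doc_a, doc_b)
print(score)  # [[0.875]] here: the documents differ only in mat/hat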
def bayes(frame, size, occmin):
    # Filtering dataset to occupations occurring more than n times
    unfiltq = getqcols(frame)
    firstq = frame.columns.get_loc(unfiltq[0])
    tofilter = frame.iloc[:, firstq:]
    overmin = tofilter.iloc[:, (frame.iloc[:, firstq:].sum() > occmin).values].columns
    frame = frame.loc[:, frame.columns[:firstq].append(overmin)]
    filtq = getqcols(frame)
    firstq = frame.columns.get_loc(filtq[0])
    # Text cleaning
    frame['text'] = frame['text'].apply(nostops).apply(
        lambda i: ' '.join(i)).apply(cleaner)
    frame.insert(
        frame.columns.get_loc(filtq[0]), 'sets',
        frame['text'].apply(lambda i: set(re.findall("[a-z]{3,}", i))))
    frame['sets'] = [[a for a in x if a not in stoplist] for x in frame['sets']]
    frame['sets'] = list(map(set, frame['sets']))
    frame['sets'] = frame.sets.apply(lambda i: ' '.join(i))
    # Vectorizer, fit, df data
    vec = cv(stop_words='english', max_features=30000)
    vec.fit(frame.sets)
    X = vec.transform(frame.text).toarray()
    acc = []
    rec = []
    pre = []
    cts = []
    totalcts = []
    cm = []
    label_used = []
    auc = []
    # Bayes loops
    for i in filtq:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            frame.loc[:, i],
                                                            test_size=size,
                                                            random_state=0)
        gb = GaussianNB()
        gb.fit(X_train, y_train)
        pred = gb.predict(X_test)
        try:
            output = metrics.classification_report(y_test, pred,
                                                   output_dict=True,
                                                   zero_division=0)['0']
            auc_score = metrics.roc_auc_score(y_test, pred)
            a = 0
        except:
            output = metrics.classification_report(y_test, pred,
                                                   output_dict=True,
                                                   zero_division=0)['1']
            a = 1
            auc_score = 'N/A'
        label_used.append(str(a))
        acc.append("{:0.3}".format(metrics.accuracy_score(y_test, pred)))
        pre.append("{:0.3}".format(output['precision']))
        rec.append("{:0.3}".format(output['recall']))
        cts.append(y_test.sum())
        cm.append(metrics.confusion_matrix(y_test, pred))
        auc.append(auc_score)
        totalcts.append(sum(frame.loc[:, i]))
    d = {
        'label': label_used,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'auc': auc,
        'cts': cts,
        'totalcts': totalcts,
        'conf': cm
    }
    df = pd.DataFrame(data=d, index=filtq)
    return df
""" bag of words vectorizor """ from sklearn.feature_extraction.text import CountVectorizer as cv string1 = 'Hey Brian, go get me sone water best, Tony' string2 = 'Dearest Tony, go suck a c**k. Bye, Brian' string3 = 'Morning Brian, you little bitch, I am gonna report you. From, Tony' #you need to put the emails in a list to vectorize them emails = [string1, string2, string3] #assigns the classifier vectorize = cv() #fits the data and transform (similar to predict) the emails bag_of_words = vectorize.fit(emails) bag_of_words = vectorize.transform(emails) print bag_of_words """ How to read bag of words (2,20) 2 (string3, word number 20) #numer of occurance """ # this prints what feature number a word is print vectorize.vocabulary_.get('you')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.metrics.pairwise import cosine_similarity as cs

cv = cv()  # note: this rebinds the imported class name to an instance
# a = np.array(4000)
import random


def find_title(index):
    try:
        return df[df.index == index]["title"].values[0]
    except:
        pass


def find_ref(title):
    try:
        return df[df.title == title]["index"].values[0]
    except:
        return 1932


df = pd.read_csv("movie.csv")
columns = df.columns
for column in columns:
    df[column] = df[column].fillna('')
    df[column] = df[column].dropna()


def recommend_by_feature(row):
def para2words(para):
    para_text = bs(para).get_text()
    # Replace non-letters with a space (replacing with "" would glue adjacent words together)
    para_lettersonly = re.sub("[^a-zA-Z]", " ", para_text)
    para_words = para_lettersonly.lower().split()
    stops = set(sw.words("english"))
    para_meaning = [n for n in para_words if n not in stops]
    return " ".join(para_meaning)


num_reviews = train["review"].size
clean_train = []
for i in range(0, num_reviews):
    clean_train.append(para2words(train["review"][i]))

vector = cv(analyzer="word", tokenizer=None, preprocessor=None,
            max_features=5000)
train_df = vector.fit_transform(clean_train)
train_df = train_df.toarray()

forest = rf(n_estimators=150)
forest = forest.fit(train_df, train["sentiment"])

numofrev = len(test["review"])
cleanreview = []
for i in range(0, numofrev):
    cleanreview.append(para2words(test["review"][i]))

test_df = vector.transform(cleanreview)  # fixed typo: was vector.transfor
test_df = test_df.toarray()
    res = f1_score(y_test, clf.predict(X_test), pos_label=None,
                   average='macro')
    print 'f1 macro:', res
    print
    # color = cm(1. * i / NUM_COLORS)  # color will now be an RGBA tuple
    # cm = plt.get_cmap('gist_rainbow')
    # fig = plt.figure(figsize=(8.0, 5.0))
    # ax = fig.add_subplot(111)
    #
    # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
    # ax.plot(range(len(scores)), scores, label=str(threshold))
    # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller')
    # plt.show()
    print name
    return res


vec_list = [tf(), cv()]
clf_list = [svc(), lr()]
threshold_list = np.arange(0.5, 3, 0.5)
print len(threshold_list)
# results_size = (len(vec_list), len(clf_list), len(threshold_list))
# results = np.zeros(results_size, dtype=np.float)
# a, b, c = range(3), range(3), range(3)
# def my_func(x, y, z):
#     return (x + y + z) / 3.0, x * y * z, max(x, y, z)
grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list))
# mean_grid, product_grid, max_grid = grids
print len(grids)
try:
    print grids.shape
except:
findElbow(x_iris)

# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()

# In[104]:

wordVector = cv(stop_words='english', encoding='latin-1')
wordWeights = tfidf(stop_words='english', encoding='latin-1')
corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])
bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)

# In[105]:

weightybags = weightybags.toarray()

# In[106]:
        avg = np.mean(new_vec, axis=0)
        max = np.max(new_vec, axis=0)
        min = np.min(new_vec, axis=0)  # fixed: was np.max, which duplicated the maximum
        all = np.concatenate((avg, max, min))
        new_data.append(all)
    return new_data


if __name__ == "__main__":
    datafolder = '/Users/claire/Dropbox/PycharmProjects/Thesis/Scripts/Data/'
    trainfile = datafolder + 'twitter/twitter.train'
    testfile = datafolder + 'twitter/twitter.dev'

    tr_data, tr_target = load_twitter_2class(trainfile)
    te_data, te_target = load_twitter_2class(testfile)

    vec1 = cv(analyzer='word', ngram_range=(1, 4))
    vec2 = cv(analyzer='char_wb', ngram_range=(1, 4))
    combined_features = FeatureUnion([("word", vec1), ("char", vec2)])
    print combined_features

    # Use combined features to transform dataset:
    print 'Fit transform data'
    X_train = combined_features.fit_transform(tr_data)
    print X_train.shape
    X_test = combined_features.transform(te_data)
    # X_train = vec2.fit_transform(tr_data)
    # X_test = vec2.transform(te_data)
    print 'TRANSFORMED'

    for i in [log, svm]:
def reviews2words(raw_review):
    review_text = bs(raw_review).get_text()
    # Fixed character class: "A-z" also spans the punctuation between 'Z' and 'a',
    # so those characters were not stripped
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    wordlist = letters_only.lower().split()
    words = [w for w in wordlist if w not in stop]
    return " ".join(words)


if __name__ == "__main__":
    train = pd.read_csv(".\data\labeledTrainData.tsv", header=0,
                        delimiter="\t", quoting=3)
    clean_train = []
    print 'Start cleaning reviews...\n'
    for i in xrange(0, train['review'].size):
        clean_train.append(reviews2words(train['review'][i]))

    print 'Creating the bag of words...\n'
    vectorizer = cv(analyzer='word', tokenizer=None, preprocessor=None,
                    stop_words=None, max_features=5000)
    train_data_features = vectorizer.fit_transform(clean_train)
    train_data_features = train_data_features.toarray()
    # print train_data_features.shape
    vocab = vectorizer.get_feature_names()
    # print vocab

    forest = rf(n_estimators=100)
    forest = forest.fit(train_data_features, train['sentiment'])

    test = pd.read_csv(".\data\\testData.tsv", header=0, delimiter="\t",
                       quoting=3)
    # Create an empty list and append the clean reviews one by one
    num_reviews = len(test["review"])
    clean_test_reviews = []
def fit(self, x):
    # Fit the encoder/scaler
    self.n = x.shape[0]
    self.p = x.shape[1]
    dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
    dt2 = x.dtypes.astype(str).reset_index(drop=True)
    self.dt = pd.Series(
        np.where(
            dt1.isin(['int64', 'float64']) & dt2.isin(['int64', 'float64']),
            'float', 'str'))
    if not all(self.dt.values == 'float'):
        self.dt[~(self.dt.values == 'float')] = \
            np.where(x.loc[:, ~(self.dt.values == 'float')].apply(
                lambda x: x.str.contains('\\|', na=False).any()),
                'lst', self.dt[~(self.dt.values == 'float')])
    self.cn = np.array(x.columns)
    stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
    self.cidx = np.where(self.dt == 'str')[0]
    self.nidx = np.where(self.dt == 'float')[0]
    self.tidx = np.where(self.dt == 'lst')[0]
    stopifnot(
        all(
            np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx])) ==
            np.arange(self.p)))
    self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
    self.all_enc = {}
    #############################################################
    # --- Encoder (i): Categorical/ordinal integer features --- #
    if len(self.cidx) > 0:
        self.cenc = ohe(sparse=self.sparse,
                        dtype=self.dtype,
                        handle_unknown='ignore',
                        drop=None)
        self.cenc.categories_ = [
            np.unique(x.iloc[:, kk]) for kk in self.cidx
        ]
        self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
        cmode_idx = np.array([
            np.where(vec == mm)[0][0]
            for vec, mm in zip(self.cenc.categories_, self.cmode)
        ])
        cum_idx = np.append([0], np.cumsum(
            [len(z) for z in self.cenc.categories_]))
        self.cenc.drop_idx = []
        self.cenc.drop_idx_ = None
        # How many features remain after dropping the most common level
        self.cenc.p = cum_idx.max() - len(self.cenc.drop_idx)
        self.cenc.cn = list(
            np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                      self.cenc.drop_idx))
        self.all_enc['cenc'] = self.cenc
    else:
        self.iter['cenc'] = False
    ###############################################
    # --- Encoder (ii): Continuous numerical ---- #
    if len(self.nidx) > 0:
        if self.quantize:
            u_nidx = np.array(
                [len(x.iloc[:, kk].unique()) for kk in self.nidx])
            self.nidx1 = self.nidx[u_nidx > 31]   # quantize
            self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
            self.nenc = {'enc': {}, 'cn': {}}
            if len(self.nidx1) > 0:
                self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                if not self.sparse:
                    self.nenc1.encode = 'onehot-dense'
                self.nenc1.fit(x.iloc[:, self.nidx1])
                self.nenc1.cn = ljoin([
                    cn + '_q' + pd.Series(qq).astype(str)
                    for cn, qq in zip(self.cn[self.nidx1], [
                        np.arange(len(z) - 1) + 1
                        for z in self.nenc1.bin_edges_
                    ])
                ])
                self.nenc['enc']['nenc1'] = self.nenc1
                self.nenc['cn']['nenc1'] = self.nenc1.cn
            if len(self.nidx2) > 0:
                self.nenc2 = ohe(sparse=self.sparse,
                                 handle_unknown='ignore',
                                 drop=None)
                self.nenc2.fit(x.iloc[:, self.nidx2])
                self.nenc2.cn = self.nenc2.get_feature_names(
                    self.cn[self.nidx2])
                self.nenc['enc']['nenc2'] = self.nenc2
                self.nenc['cn']['nenc2'] = self.nenc2.cn
            self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
            self.all_enc['nenc'] = self.nenc
        else:
            self.nenc = ss(copy=False)
            self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
            self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
            self.nenc.n_features_in_ = self.nidx.shape[0]
            self.nenc.p = self.nidx.shape[0]
            self.nenc.cn = list(self.cn[self.nidx])
            self.all_enc['nenc'] = self.nenc
    else:
        self.iter['nenc'] = False
    ################################################
    # --- Encoder (iii): Tokenize text blocks ---- #
    if len(self.tidx) > 0:
        self.tenc = dict(
            zip(self.cn[self.tidx], [
                cv(tokenizer=lambda x: tok_fun(x),
                   lowercase=False,
                   token_pattern=None,
                   binary=True) for z in range(self.tidx.shape[0])
            ]))
        self.tenc = {'cv': self.tenc}
        for kk, jj in enumerate(self.cn[self.tidx]):
            self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
        self.tenc['p'] = sum(
            [len(z.vocabulary_) for z in self.tenc['cv'].values()])
        self.tenc['cn'] = ljoin([
            l + '_' + pd.Series(list(z.vocabulary_.keys()))
            for z, l in zip(self.tenc['cv'].values(),
                            self.tenc['cv'].keys())
        ])
        self.all_enc['tenc'] = self.tenc
    else:
        self.iter['tenc'] = False
    # Store all transforms in a dictionary to iterate over with self.iter
    self.enc_transform = {
        'cenc': self.cenc_transform,
        'nenc': self.nenc_transform,
        'tenc': self.tenc_transform
    }
    # Get the valid categories
    self.tt = np.array(list(self.iter.keys()))[np.where(
        list(self.iter.values()))[0]]
    # Get full feature names
    cn = []
    for ee in self.tt:
        if hasattr(self.all_enc[ee], 'cn'):
            cn.append(self.all_enc[ee].cn)
        else:
            cn.append(self.all_enc[ee]['cn'])
    cn = ljoin(cn)
    self.cn_transform = cn
'''
This file implements the functions needed for the SVM classifier.
Most of them are fairly straightforward.
'''
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn import svm

'''
Globals. For some reason this is the only way to get the classifiers to work.
'''
# Vectorizes the words into a format suitable for processing.
# The ngram range gives the minimum and maximum n-gram sizes taken into account.
vectorizer = cv(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
svm = svm.LinearSVC()


def train(train_data, labels):
    global svm
    svm.fit(train_data, labels)
    return


''' Returns the accuracy on a given dataset. '''
def score(test_data, test_labels):
    global svm
    return svm.score(test_data, test_labels)


''' Predicts the class of the tweets passed as input. '''
def predict(test_data):
    return svm.predict(test_data)


'''
Turns a set of tweets into its corresponding vector.
For more about vectorization see:
http://en.wikipedia.org/wiki/Bag-of-words_model
'''
def vectorize_tweets(training_set, train=True):
import seaborn as sns
%matplotlib inline
import string
from nltk.corpus import stopwords


# Removing punctuation and stopwords
def function_before(mess):
    nopunc = []
    for char in mess:
        if char not in string.punctuation:
            nopunc.append(char)
    nopunc = ''.join(nopunc)
    clean = []
    for word in nopunc.split():
        word = word.lower()
        if word not in stopwords.words('english'):
            clean.append(word)
    return clean


from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfTransformer

# Build the NLP model as a pipeline ending in a Naive Bayes classifier
pipeline = Pipeline([('bow', cv(analyzer=function_before)),
                     ('tfidf', TfidfTransformer()),
                     ('classifier', MultinomialNB())])
pipeline.fit(data, emotional_class)
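# Once fitted, the pipeline can classify new raw messages directly, since the CountVectorizer
# step applies function_before on the way in. The messages below are made-up examples.
new_messages = ["I absolutely loved this, what a wonderful day",
                "this is terrible and I am very upset"]
predicted_classes = pipeline.predict(new_messages)
print(predicted_classes)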
'').replace('""', '').replace( '"', '').replace('|', '') word = cleansed_w.lower() if word not in stop_words: revised_rev.append(word + ' ') test_text.append(''.join(revised_rev)) train_text = list() labels = list() n = 0 for data in train_data: train_text.append(data[1]) labels.append(data[2]) vectorizer = cv(encoding='utf-8', strip_accents='unicode', ngram_range=(1, 1), decode_error='replace') vector_data = vectorizer.fit_transform(train_text) model_selector = model_selection X_train, X_test, y_train, y_test = model_selector.train_test_split( vector_data, labels, stratify=labels, test_size=0.2) classifier = sgd(loss='hinge', penalty='l1') classifier.fit(X_train, y_train) train_scores = classifier.score(X_train, y_train) print('Unigram Results') print('Train Scores') print(train_scores) print("Accuracy: %0.2f (+/- %0.2f)" %
def __init__(self, revisoes, ngram_range=(1, 1)):
    self.vetorizador = cv(ngram_range=ngram_range)
    self.revisoes = revisoes
    self.vetorizar()
    print("\nCountVectorizer finished vectorizing %s." % str(ngram_range))
                    header=0, delimiter='\t', quoting=3)

tokens, reviews = [], []
# preprocessing data => converting reviews to token lists of words
for rev in range(0, train.shape[0]):
    t, r = review_to_words(train["review"][rev])
    tokens.append(t)   # token list
    reviews.append(r)  # separate reviews

vocabulary = 5000  # max features
vectorizer = cv(analyzer="word", tokenizer=None, preprocessor=None,
                stop_words=None, max_features=vocabulary)
X = vectorizer.fit_transform(reviews).toarray()
Y = train["sentiment"]

validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size)

#classifier = DecisionTreeClassifier()    #DTC
#classifier = SVC()                       #SVM
#classifier = KNeighborsClassifier()      #KNN
classifier = NB(alpha=2)  # alpha=0 means no Laplace smoothing
classifier.fit(X_train, np.array(Y_train))