def __init__(self):
    stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
    self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
    self.emails = read_email_bodies()
    # train on the given email data.
    self.train()
def add_stop_words(self):
    if self.stop_words is None:
        self.stop_words = list(ENGLISH_STOP_WORDS)
        logging.info("using default stop words")
    else:
        words = self._split_on_spaces(self.stop_words)
        self.stop_words = list(ENGLISH_STOP_WORDS.union(words))
        logging.info("using custom stop words")
    logging.debug("stop words:%s" % self.stop_words)
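# The _split_on_spaces helper called above is not shown in this snippet.
# A minimal sketch, assuming it just turns a space-separated string of custom
# stop words into a list:
def _split_on_spaces(self, value):
    # drop empty strings produced by repeated spaces
    return [word for word in value.split(' ') if word]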
def lda(text, n_features, n_topics, n_top_words):
    """
    perform latent dirichlet allocation

    input (array): an array of strings
    """
    # add to stop words
    # the word inapplicable is a result of the questionnaire
    stop_words = ENGLISH_STOP_WORDS.union(['inapplicable'])
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.,
                                    max_features=n_features,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(text)
    model = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50., random_state=0)
    model.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    tops = get_top_words(model, tf_feature_names, n_top_words)
    return tops
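# get_top_words is called above but not defined in this excerpt. A minimal sketch,
# assuming it collects the highest-weighted words per topic from the fitted
# model's components_ matrix:
def get_top_words(model, feature_names, n_top_words):
    tops = []
    for topic in model.components_:
        # indices of the n_top_words largest weights, in descending order
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        tops.append([feature_names[i] for i in top_indices])
    return tops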
def fit(self):
    pass

def transform(self):
    matrix = self.vectorizer.transform(self.text)
    self.X = self.dim_reducer.transform(matrix.toarray())

def predict(self):
    labels = self.model.predict(self.X)
    self.X = np.column_stack((self.X, labels))
    return labels

def find_closest_beer_names(self):
    pass

def recommend(self, user_input):
    pass


def no_number_preprocessor(tokens):
    r = re.sub(r'(\d)+', '', tokens.lower())
    return r


stop_words = ENGLISH_STOP_WORDS.union({
    'king', 'german', 'brau', 'james', 'brewery', 'company', 'brewing', 'house',
    'bock', 'style', 'scotch', 'california', 'oktoberfest', 'wee', 'special',
    'english', 'american', 'hefeweizen', 'old', 'common', 'gose', 'NUM'})

if __name__ == '__main__':
    df = load_data()
df_eng_all_hl = pd.DataFrame({
    "Headlines": eng_all_hl,
    "Date": all_headlines.iloc[:, 1],
    'Publisher': all_headlines.iloc[:, 2]
})

# Overview of the sentiment
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

all_hl_lists = svd_headline_list + ex_headline_list + dn_headline_list + af_headline_list + metro_headline_list
joint_headlines = ','.join(eng_all_hl)

my_stop_words = ENGLISH_STOP_WORDS.union(
    ['sweden', 'swedish', 'new', 'best', 'want', 'does', 'dn'])
my_cloud = WordCloud(background_color='white', stopwords=my_stop_words).generate(joint_headlines)
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Tokenizing
from nltk import word_tokenize
import nltk
nltk.download('punkt')

word_tokens = [word_tokenize(review) for review in df_eng_all_hl.Headlines]
cleaned_tokens = [[word for word in item if word.isalpha()] for item in word_tokens]
list_cleaned_tokens = []
from nltk.stem.snowball import SnowballStemmer
from params import mlp_params, svm_params, log_params
import pandas as pd
import pickle
import nltk
import re
import eda

TOP_K_FEATURES = 20000
NEG_REGEX = re.compile(r"^(\w*?n't|no(t)?|never$)", re.I)
WORD_REGEX = re.compile(r"^[a-zA-Z_']+$", re.I)
STEMMER = SnowballStemmer('english')
STOP_WORDS = frozenset(
    ENGLISH_STOP_WORDS.union(['movie', 'film']).difference(['not', 'never', 'no']))
ALGS_METRICS = {}

models = [
    ('MNB', MultinomialNB(), None),
    ('LogReg', LogisticRegression(), log_params),
    ('SVM', LinearSVC(), svm_params),
    # ('MLP', MLPClassifier(), mlp_params),
    ('DT', DecisionTreeClassifier(), None)
]


def preprocess_raw_text(raw_review):
    """
    negates appropriate words
    removes stop words
ax.set_title("cluster = " + str(df.label), fontsize=16)
ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
ax.barh(x, df.score, align='center', color='#7530FF')
ax.set_yticks(x)
ax.set_ylim([-1, x[-1] + 1])
yticks = ax.set_yticklabels(df.features)
plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
plt.show()

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

stopwords = ENGLISH_STOP_WORDS.union(
    ['ect', 'hou', 'com', 'recipient', 'dell', 'hi', 'hello', 'nikitha'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
X = vect.fit_transform(df1.Body)
features = vect.get_feature_names()

# Now we print the top terms across all documents.
print(top_mean_feats(X, features, None, 0.1, 20))

df1[1:10]
train = df1.sample(frac=0.8, random_state=200)
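# top_mean_feats is called above (and in a later snippet) but not defined in this
# excerpt. A minimal sketch, assuming it averages tf-idf weights over a group of rows
# (all rows when grp_ids is None), zeroes out weights below min_tfidf, and returns the
# top_n features by mean weight:
import numpy as np
import pandas as pd

def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=20):
    D = X[grp_ids].toarray() if grp_ids is not None else X.toarray()
    D[D < min_tfidf] = 0
    means = D.mean(axis=0)
    top_ids = np.argsort(means)[::-1][:top_n]
    return pd.DataFrame([(features[i], means[i]) for i in top_ids],
                        columns=['features', 'score'])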
def main():
    stop_words = set(STOPWORDS)
    stop_words.update(ENGLISH_STOP_WORDS)
    # extra stop words
    extra_words = ["said", "say", "seen", "come", "end", "came", "year", "years", "new", "saying"]
    # keep WordCloud's STOPWORDS, sklearn's list and the extra words together
    stop_words = stop_words.union(extra_words)

    df = pd.read_csv('train_set.csv', sep='\t')

    cat_politics = []
    cat_film = []
    cat_football = []
    cat_business = []
    cat_technology = []

    # store the content for each category
    for index in range(len(df.Category)):
        cat = df.Category[index]
        if cat == "Politics":
            cat_politics.append(df.Content[index])
        elif cat == "Film":
            cat_film.append(df.Content[index])
        elif cat == "Football":
            cat_football.append(df.Content[index])
        elif cat == "Business":
            cat_business.append(df.Content[index])
        elif cat == "Technology":
            cat_technology.append(df.Content[index])

    str_pol = ''.join(cat_politics)
    str_fil = ''.join(cat_film)
    str_foo = ''.join(cat_football)
    str_bus = ''.join(cat_business)
    str_tec = ''.join(cat_technology)

    # produce a word cloud for each category
    cloud = WordCloud(background_color="white", mode="RGB",
                      stopwords=stop_words, width=1920, height=1080)

    w = cloud.generate(str_pol)
    plt.figure()
    plt.title("Politics")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('Politics.png')

    w = cloud.generate(str_fil)
    plt.figure()
    plt.title("Film")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('Film.png')

    w = cloud.generate(str_foo)
    plt.figure()
    plt.imshow(w)
    plt.title("Football")
    plt.axis("off")
    plt.savefig('Football.png')

    w = cloud.generate(str_bus)
    plt.figure()
    plt.imshow(w)
    plt.title("Business")
    plt.axis("off")
    plt.savefig('Business.png')

    w = cloud.generate(str_tec)
    plt.figure()
    plt.imshow(w)
    plt.title("Technology")
    plt.axis("off")
    plt.savefig('Technology.png')
# Code used in part 2 of How I used machine learning to classify emails and turn them into insights.
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from helpers import parse_into_emails
from query import EmailDataset

# Just like in part_1, read and preprocess emails.
emails = pd.read_csv('split_emails.csv')
email_df = pd.DataFrame(parse_into_emails(emails.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
vec_train = vec.fit_transform(email_df.body)

# print out the vector of the first email
# print(vec_train[0:1])

# Find cosine similarity between the first email and all others.
cosine_sim = linear_kernel(vec_train[0:1], vec_train).flatten()
# print out the cosine similarities
# print(cosine_sim)

# Finding emails related to a query.
query = "john"
# Transform the query into the original vector space.
vec_query = vec.transform([query])
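# The excerpt stops after vectorising the query. A minimal sketch of a likely next
# step (not part of the original excerpt): score every email against the query with
# the same cosine similarity and pull out the closest matches.
cosine_sim_query = linear_kernel(vec_query, vec_train).flatten()
related_email_indices = cosine_sim_query.argsort()[::-1][:10]
print(email_df.body.iloc[related_email_indices])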
def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(word):
    if word in c_dict.keys():
        return c_dict[word]
    else:
        return word


# stop words
# add more, ex: news outlet name
add_stop = ['said', 'say', '...', 'like', 'cnn', 'ad', 'bbc']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)
punc = list(set(string.punctuation))


def process_text(text):
    # if isinstance(text, float):
    #     print(text)
    #     return
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(word) for word in text]
    stemmed_text = []
    # We tried using Snowball Stemmer and Lancaster Stemmer
    ps = LancasterStemmer()
classifiers = {
    "Naïve Bayes": MultinomialNB(),
    "Support Vector Machine": LinearSVC(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression()
}
max_features = [200, 1000, 10000]
n_categories = {"Exc./Non-Exc": "nota", "1-5": "classe 2"}
n_grams = {"1": (1, 1), "1-3": (1, 3)}
stop_words = {
    "No": None,
    "Yes": ENGLISH_STOP_WORDS.union(get_stop_words('spanish')).union(
        get_stop_words('portuguese'))
}

graph_data = []
for category_key, category in n_categories.items():
    for max_feature in max_features:
        for gram_key, gram in n_grams.items():
            for stop_key, stop_word in stop_words.items():
                for class_key, classifier in classifiers.items():
                    count_vectorizer = CountVectorizer(
                        analyzer='word',
                        lowercase=True,
                        stop_words=stop_word,
                        ngram_range=gram,
                        max_features=max_feature)
def crossValidationRoc(df, method, n_components, category):
    # Add noisy features
    random_state = np.random.RandomState(0)
    classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    avgAccuracy = 0
    nFolds = 10
    kf = KFold(n_splits=nFolds)
    fold = 0

    my_additional_stop_words = [
        'said', 'th', 'month', 'much', 'thing', 'say', 'says'
    ]
    stop_words = ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    count_vect = TfidfVectorizer(stop_words=stop_words)
    # count_vect = CountVectorizer(stop_words=stop_words)
    count_vect.fit(df['Content'] + df['Title'])
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(count_vect.transform(df['Content'] + df['Title']))

    for train_index, test_index in kf.split(df):
        X_train_counts = count_vect.transform(df['Content'].iloc[train_index])
        X_train_counts = np.add(
            X_train_counts,
            count_vect.transform(df['Title'].iloc[train_index]) * 2)
        X_test_counts = count_vect.transform(df['Content'].iloc[test_index])
        X_test_counts = np.add(
            X_test_counts,
            count_vect.transform(df['Title'].iloc[test_index]) * 2)
        X_train_counts = svd.transform(X_train_counts)
        X_test_counts = svd.transform(X_test_counts)
        probas_ = classifier.fit(
            X_train_counts,
            df['Category'].iloc[train_index]).predict_proba(X_test_counts)
        # Compute ROC curve and area under the curve
        test1 = label_binarize(
            df['Category'].iloc[test_index],
            classes=["Business", "Film", "Football", "Politics", "Technology"])
        fpr, tpr, thresholds = roc_curve(test1[:, categories_map[category]],
                                         probas_[:, categories_map[category]])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))
        print("Fold " + str(fold))
        fold += 1

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= 10
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic ' + category)
    plt.legend(loc="lower right")
    plt.savefig('output/' + category + '_roc')
    plt.close()
    return avgAccuracy
def read_dataset(filename):
    ## function to read the dataset in the correct format.
    with open(filename) as f:
        dataset = f.read().splitlines()
    X = [i.split('\t')[0] for i in dataset]
    y = [i.split('\t')[1] for i in dataset]
    y = [0 if x == 'democrat' else 1 for x in y]
    return X, y


Xtrain, ytrain = read_dataset('train_newline.txt')
Xtest, ytest = read_dataset('dev_newline.txt')
X = Xtrain + Xtest

## added these stop words after observing the top features in the different models
stop_words_list = ENGLISH_STOP_WORDS.union([
    u'http', u'rt', u'amp', u'just', u'bit', u'ly', u'com', u'url',
    u'tinyurl', u'ow', u'twurl'
])

### Uni-gram Model:
dic = CountVectorizer(input=X, ngram_range=(1, 1), analyzer='word',
                      stop_words=stop_words_list)
vecs = dic.fit_transform(X)
### getting the feature names of the different features to be used in the classifier.
features = dic.fit(X).get_feature_names()
trainvecs = vecs[0:40000, :]
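# Not part of the original excerpt: a minimal sketch of how the uni-gram vectors
# might be split back into train/dev rows and fed to a classifier, assuming the
# training file contributes the first 40000 rows (as the slice above suggests).
# The LogisticRegression choice is illustrative only.
from sklearn.linear_model import LogisticRegression

testvecs = vecs[40000:, :]
clf = LogisticRegression(max_iter=1000)
clf.fit(trainvecs, ytrain)
print(clf.score(testvecs, ytest))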
def __init__(self):
    # Build a list of stop words that I don't want to use as features.
    # These are often '.' but maybe other ones down the road.
    my_stop_words = ['.', '(', ')', ' ', ' .', '..', ').', ' )', ' , ', ' ,']
    stop_words = ENGLISH_STOP_WORDS.union(my_stop_words)
    # Note: the union above is never passed to the vectorizer, and scikit-learn only
    # applies stop_words when analyzer='word', so the 'english' setting below has no
    # effect with the char_wb analyzer.
    self.vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7),
                                      stop_words='english', min_df=1, max_df=1.0)
# Drop emails with empty body, to or from_ columns.
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

# Preview dataframe.
print("\nDataframe preview: \n", email_df.head())

# Print unique email addresses.
print("\nUnique FROM email addresses:", len(email_df.from_.unique()))
print("Unique TO email addresses:", len(email_df.to.unique()))

# Tokenize the bodies and convert them into a document-term matrix.
# Adding extra stop words that appeared frequently in the dataset but were not of interest.
stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])

# Vectorizer.
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
X = vect.fit_transform(email_df.body)
features = vect.get_feature_names()

# Print the top terms across all documents.
print("\nMost frequent terms in the dataset: \n",
      top_mean_feats(X, features, None, 0.1, 10))

# Data classification:
def main():
    # ------------------------------DATA----------------------------------
    train_data = pd.read_csv('train_set.csv', sep="\t")
    test_data = pd.read_csv('test_set.csv', sep="\t")
    train_data = train_data.drop('RowNum', axis=1)   # ignore RowNum
    test_data = test_data.drop('RowNum', axis=1)

    # ------------------------------Processing----------------------------
    # extra stop words
    extra_words = ["said", "say", "seen", "come", "end", "came", "year", "years", "new", "saying"]
    stopwords = ENGLISH_STOP_WORDS.union(extra_words)
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stopwords)    # convert to tf-idf
    tsvd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)  # set dimensions

    set(train_data['Category'])                        # check categories
    le = preprocessing.LabelEncoder()                  # set labels
    le.fit(train_data["Category"])                     # fit them to the number of our categories
    y_train = le.transform(train_data["Category"])     # transform categories
    set(y_train)

    count_vectorizer = CountVectorizer(stop_words=stopwords)               # set stop words for the vectorizer
    X_trainNoLSI = count_vectorizer.fit_transform(train_data['Content'])   # vectorize our data
    tsvd.fit(X_trainNoLSI)                             # truncate data
    X_train = tsvd.transform(X_trainNoLSI)             # store them
    test_noLSI = count_vectorizer.transform(test_data['Content'])          # test data
    test = tsvd.transform(test_noLSI)

    k_fold = KFold(n_splits=10)                        # 10-fold validation

    # --------------------------------SVM---------------------------------
    clf = svm.SVC(kernel='rbf', C=100, gamma='auto')   # algorithm for application
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)

    # --------------------------------SVM_scores--------------------------
    print("SVM scores:")
    SVMprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    svm_prec = SVMprecs.mean()
    print("precision:", svm_prec)
    SVMrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    svm_rec = SVMrecs.mean()
    print("recall:", svm_rec)
    SVMfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    svm_fm = SVMfms.mean()
    print("F-measure:", svm_fm)
    SVMaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    svm_acc = SVMaccs.mean()
    print("accuracy:", svm_acc)

    # ---------------------------------RF---------------------------------
    clf = RandomForestClassifier(max_depth=6, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)

    # ---------------------------------RF_scores--------------------------
    print("RF scores:")
    RFprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    rf_prec = RFprecs.mean()
    print("precision:", rf_prec)
    RFrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    rf_rec = RFrecs.mean()
    print("recall:", rf_rec)
    RFfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    rf_fm = RFfms.mean()
    print("F-measure:", rf_fm)
    RFaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    rf_acc = RFaccs.mean()
    print("accuracy:", rf_acc)

    # ----------------------------------MNB--------------------------------
    clf = MultinomialNB()
    clf.fit(X_trainNoLSI, y_train)
    y_pred = clf.predict(test_noLSI)

    # ----------------------------------MNB_scores-------------------------
    print("MNB scores:")
    MNBprecs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='precision_micro')
    mnb_prec = MNBprecs.mean()
    print("precision:", mnb_prec)
    MNBrecs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='recall_micro')
    mnb_rec = MNBrecs.mean()
    print("recall:", mnb_rec)
    MNBfms = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='f1_micro')
    mnb_fm = MNBfms.mean()
    print("F-measure:", mnb_fm)
    MNBaccs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='accuracy')
    mnb_acc = MNBaccs.mean()
    print("accuracy:", mnb_acc)

    # -----------------------------------K-Nearest_Neighbor------------------
    clf = knn.myKNN(10)   # K=10, check knn_functions.py (imported)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)

    # ---------------------------------KNN_scores--------------------------
    print("KNN scores:")
    KNNprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    knn_prec = KNNprecs.mean()
    print("precision:", knn_prec)
    KNNrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    knn_rec = KNNrecs.mean()
    print("recall:", knn_rec)
    KNNfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    knn_fm = KNNfms.mean()
    print("F-measure:", knn_fm)
    KNNaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    knn_acc = KNNaccs.mean()
    print("accuracy:", knn_acc)

    # ----------------------------------------------------------------------
    # My Method
    # ----------------------------------------------------------------------
    # strip punctuation from the data
    test_data['Content'] = test_data['Content'].str.replace(r'[^\w\s]', '')
    train_data['Content'] = train_data['Content'].str.replace(r'[^\w\s]', '')
    # convert multiple spaces to one
    test_data['Content'] = test_data['Content'].str.replace(r'\s+', ' ')
    train_data['Content'] = train_data['Content'].str.replace(r'\s+', ' ')

    # same process as before
    set(train_data['Category'])
    le = preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y_train = le.transform(train_data["Category"])
    set(y_train)
    X_train = count_vectorizer.fit_transform(train_data['Content'])
    test = count_vectorizer.transform(test_data['Content'])

    # usage of MNB
    max = 0.0
    maxi = 0.0
    i = 0.01
    # search for the best smoothing parameter (alpha)
    while i < 1.0:
        clf = MultinomialNB(alpha=i)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(test)
        myprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
        my_prec = myprecs.mean()
        if my_prec > max:
            max = my_prec
            maxi = i
        i += 0.01

    print("My Method scores:")
    clf = MultinomialNB(alpha=maxi, fit_prior=True)
    clf.fit(X_train, y_train)
    the_pred = clf.predict(test)
    print("precision:", max)
    myrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    my_rec = myrecs.mean()
    print("recall:", my_rec)
    myfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    my_fm = myfms.mean()
    print("F-measure:", my_fm)
    myaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    my_acc = myaccs.mean()
    print("accuracy:", my_acc)

    # ------------------------------------CSV---------------------------------
    # my method csv
    output = 'testSet_categories.csv'
    predicted = le.inverse_transform(the_pred)
    testingfile = pd.DataFrame({'ID': test_data['Id'],
                                'Predicted_Category': list(predicted)},
                               columns=['ID', 'Predicted_Category'])
    testingfile.to_csv(output, encoding='utf-8', index=False, sep='\t')

    # results csv
    output = 'EvaluationMetric_10fold.csv'
    d = {'StatisticMeasure': ['Accuracy', 'Precision', 'Recall', 'F-Measure'],
         'Naive Bayes': [mnb_acc, mnb_prec, mnb_rec, mnb_fm],
         'Random Forest': [rf_acc, rf_prec, rf_rec, rf_fm],
         'SVM': [svm_acc, svm_prec, svm_rec, svm_fm],
         'KNN': [knn_acc, knn_prec, knn_rec, knn_fm],
         'My Method': [my_acc, max, my_rec, my_fm]}
    df = pd.DataFrame(data=d, columns=['StatisticMeasure', 'Naive Bayes', 'Random Forest',
                                       'SVM', 'KNN', 'My Method'])
    df.to_csv(output, encoding='utf-8', index=False, sep='|')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk
import string

rm_pun = dict((ord(char), None) for char in string.punctuation)
stmr = nltk.stem.porter.PorterStemmer()


def nml(t):
    # remove punctuation, stem
    tk = nltk.word_tokenize(t.lower().translate(rm_pun))
    r = [stmr.stem(i) for i in tk]
    # print("\nnormalize")
    # for w in r:
    #     print(w)
    return r


lst = [
    "cnnbrk", '’', '“', 'https…', 'htt…', 'h…', 's', 't', "cnnbrk…", "”", "…",
    "wo…", "”", "…", "w…", "a…", "m…", "i…", "t…", "‘", "an…", "g…", "d…",
    "to…", "p…", "o…", "is…", "in…", "wh…", "c…", "…", "so…", "y…", "and…",
    "मे", "तो", "से", "be…", "re…", "are…", "as…", "no…", "r…", "ft…",
    "they…", "—", "not…", "f…", "l…", "e…", "it…", "u…", "b…", "n…", "tr…", "we…"
]
stpw = ENGLISH_STOP_WORDS.union(stopwords.words('english')).union(lst)
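# Not part of the original snippet: a minimal sketch of how nml and stpw might be
# wired into a vectoriser. Note that scikit-learn applies stop_words to the output
# of the custom tokenizer, so stemmed tokens may no longer match the unstemmed
# stop words exactly (the library warns about this inconsistency).
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(tokenizer=nml, stop_words=list(stpw))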
print(len(have_cancel), 'records have "cancel*" in them')

canceled_cats = Counter([i['category'] for i in have_cancel])
sorted(canceled_cats.items(), key=itemgetter(1), reverse=True)[0:10]

# # Set up the vectorisers and classifiers
# The per-record text data is fairly sparse and the vocabulary is quite big overall,
# so it's worth trying different vectorisers.

# In[102]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as ESW
ESW = ESW.union({'cancelled', 'canceled'})
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score, KFold  # sklearn.cross_validation in releases before 0.20


def my_tokeniser(string):
    '''
    This can be changed to result in more sophisticated word detection.
    For now, it just splits up into alpha-only chunks, strips numbers.
    Preserves hyphenated and apostrophed words but ignores other punct.
    Gets rid of single-char stuff.
    '''
    pattern = re.compile(r"[A-Za-z0-9\-']*[^\W]")
    return [i for i in re.findall(pattern, string)
            if not i.isnumeric() and len(i) > 1]
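# The original excerpt stops before the vectoriser/classifier combinations are
# assembled. A minimal sketch, assuming the custom tokeniser and the extended stop
# word set feed a simple Pipeline; the step names and the texts/labels variables
# below are illustrative, not from the source.
text_clf = Pipeline([
    ('vec', TfidfVectorizer(tokenizer=my_tokeniser, stop_words=list(ESW))),
    ('clf', MultinomialNB()),
])
# scores = cross_val_score(text_clf, texts, labels, cv=KFold(n_splits=5))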
from nltk.stem.wordnet import WordNetLemmatizer
import re
import numpy as np


def testSet_categoriesCSV(predicted_categories, ids):
    d = {'ID': pd.Series(ids),
         'Predicted_Category': pd.Series(predicted_categories)}
    df = pd.DataFrame(d)
    df.to_csv('Produced_Files/testSet_categories.csv', sep='\t',
              index=False, columns=['ID', 'Predicted_Category'])


size = 10000
components = 160
my_additional_stop_words = ['people', 'said', 'did', 'say', 'says', 'year', 'day',
                            'just', 'good', 'come', 'make', 'going', 'having',
                            'like', 'need', 'given', 'got']

vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS.union(my_additional_stop_words))
le = preprocessing.LabelEncoder()
lsi_model = TruncatedSVD(n_components=components)
ps = PorterStemmer()
lmtzr = WordNetLemmatizer()
clf = svm.SVC(kernel='rbf', C=1, gamma=1)
# clf = SGDClassifier()

# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------TRAIN--------------------------------------------------------------
dataset = pd.read_csv('../datasets/project_1/train_set.csv', sep="\t")
# dataset = dataset[0:size]
le.fit(dataset["Category"])
y = le.transform(dataset["Category"])
def add_stop_words(self):
    if self.stop_words is not None:
        words = self._split_on_spaces(self.stop_words)
        self.stop_words = ENGLISH_STOP_WORDS.union(words)
    'trade': 9}

data = []
target = []
docs = reuters.fileids()
for doc in docs:
    # Check if the document is only related to 1 class and that class is in category_dict
    if len(reuters.categories(doc)) == 1 and reuters.categories(doc)[0] in category_dict:
        data.append(" ".join(reuters.words(doc)))                   # Text of the document
        target.append(category_dict[reuters.categories(doc)[0]])    # Index for the class
print("Dataset REUTERS loaded...")

# Pre-process the dataset
print("Pre-processing the dataset...")
stemmer = PorterStemmer()   # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words])   # Stem the stop words for larger detection

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    # [stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:   # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
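# Not part of the original excerpt: since some documents are dropped from
# processed_data, the parallel target list presumably needs the same rows removed
# so the two stay aligned; a minimal sketch:
target = [label for i, label in enumerate(target) if i not in id_to_delete]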