def __init__(self):
    stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
    self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
    self.emails = read_email_bodies()
    # train on the given email data.
    self.train()
def add_stop_words(self):
    if self.stop_words is None:
        self.stop_words = list(ENGLISH_STOP_WORDS)
        logging.info("using default stop words")
    else:
        words = self._split_on_spaces(self.stop_words)
        self.stop_words = list(ENGLISH_STOP_WORDS.union(words))
        logging.info("using custom stop words")
    logging.debug("stop words:%s" % self.stop_words)
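# _split_on_spaces() is defined elsewhere in the class that owns add_stop_words().
# A plausible one-line implementation (an assumption, not the original code):
#
#     def _split_on_spaces(self, text):
#         return [word for word in text.split(' ') if word]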
def lda(text, n_features, n_topics, n_top_words):
    """
    perform latent dirichlet allocation
    input (array): an array of strings
    """
    # add to stop words
    # the word inapplicable is a result of the questionnaire
    stop_words = ENGLISH_STOP_WORDS.union(['inapplicable'])
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.,
                                    max_features=n_features,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(text)
    model = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0)
    model.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    tops = get_top_words(model, tf_feature_names, n_top_words)
    return tops
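# The snippet above relies on a get_top_words() helper that is not shown here.
# A minimal sketch of what such a helper could look like (the name, signature
# and return shape are assumptions, not the original implementation):
def get_top_words(model, feature_names, n_top_words):
    """Return the n_top_words highest-weighted terms for each LDA topic."""
    tops = []
    for topic in model.components_:
        # argsort is ascending, so slice from the end to get the largest weights first
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        tops.append([feature_names[i] for i in top_indices])
    return tops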
def top_n_words_heatmap(self, n=10, ngrams=(1, 1), by_decade=True):
    """Produces a heatmap for top N words or their combinations over the years"""
    df = self.data.copy()
    df.reset_index(drop=True, inplace=True)
    sw = ENGLISH_STOP_WORDS.union(CSW)
    c = CountVectorizer(stop_words=sw, ngram_range=ngrams)
    m = c.fit_transform(df.lyrics)
    rev = {it: ind for ind, it in c.vocabulary_.items()}
    words = []
    for oi, m0 in enumerate(m):
        words.append({oi: {rev[ind]: it
                           for ind, it in enumerate(m0.toarray().reshape(-1))
                           if it > 0}})
    words_df = [pd.DataFrame(word) for word in words]
    total_words = pd.concat(words_df, axis=1, sort=False)
    trans = total_words.T
    if by_decade:
        trans["_year"] = df["decade"].fillna(0)
    else:
        trans["_year"] = df["year"].fillna(0)
    trans["_year"] = pd.to_numeric(trans["_year"].fillna(0), downcast="integer")
    words_by_year = trans.groupby("_year").sum()
    top10 = {y: z.nlargest(n) for y, z in words_by_year.iterrows()}
    top10_df = pd.concat(top10, axis=1, sort=False)
    top10_df["sum"] = top10_df.sum(axis=1)
    top10_df.sort_values(by="sum", ascending=False, inplace=True)
    top10_df.drop("sum", axis=1, inplace=True)
    fig = plt.figure(figsize=(15, 10))
    sns.heatmap(top10_df.T, square=True, cmap="YlGnBu",
                cbar_kws={"orientation": "horizontal"})
def labelClustersWKeywords(labels, myReader, num_clusters):
    top_features_list = []
    print(myReader)
    for cluster in range(num_clusters):
        # indices of documents in this cluster
        indices = [index for index, clusterNum in enumerate(labels) if clusterNum == cluster]
        # documents in this cluster
        clusterCorpus = [doc_dict['negative_feedback']
                         for (docnum, doc_dict) in myReader.iter_docs()
                         if docnum in indices]
        custom_stop_words = ENGLISH_STOP_WORDS.union(["firefox"])
        vectorizer = TfidfVectorizer(stop_words=custom_stop_words)
        X_tf = vectorizer.fit_transform(clusterCorpus)
        response = vectorizer.transform(clusterCorpus)
        feature_names = vectorizer.get_feature_names()
        top_n = 5
        feature_name_occurences = np.nonzero(response.toarray())[1]
        most_common_n = collections.Counter(feature_name_occurences).most_common(top_n)
        top_features = [feature_names[feature[0]] for feature in most_common_n]
        top_features_list.append(top_features)
    feature_names_df = pd.DataFrame(top_features_list, columns=['1', '2', '3', '4', '5'])
    return feature_names_df
def main():
    # same approach as in the classification script
    df = pd.read_csv("datasets/train_set.csv", sep="\t")
    with open('extraStopWords.json', 'r') as extraStopWords:
        extraStopWords = json.load(extraStopWords)
    stop_words = ENGLISH_STOP_WORDS.union(extraStopWords)
    count_vect = CountVectorizer(stop_words=stop_words)
    X_train_counts = count_vect.fit_transform(df.Content)

    # load the test set
    df2 = pd.read_csv("datasets/test_set.csv", sep="\t")
    X_test_counts = count_vect.transform(df2.Content)

    # linear SVM because it was the best
    clf_cv = MultinomialNB().fit(X_train_counts, np.array(df.Category))
    y_pred = clf_cv.predict(X_test_counts)

    f = open("testSet_categories.csv", "w")
    f.write("ID\tPredicted_Category\n")
    i = 0
    for pred in y_pred:
        f.write(str(df2.Id[i]) + "\t" + pred + "\n")
        i += 1
    f.close()
def main():
    # read the CSV into pandas, build the vectorizer and apply TruncatedSVD to reduce the dimensionality
    with open('extraStopWords.json', 'r') as extraStopWords:
        extraStopWords = json.load(extraStopWords)
    stopWords = ENGLISH_STOP_WORDS.union(extraStopWords)
    df = pd.read_csv("datasets/train_set.csv", sep="\t")
    count_vect = CountVectorizer(stop_words=stopWords)
    X_train_counts = count_vect.fit_transform(df.Content)
    svd = TruncatedSVD(n_components=60)
    svd.fit(X_train_counts)

    # object used for the 10-fold cross validation
    kf = KFold(n_splits=10)
    # fold = 0

    # metric accumulators: each fold adds its score and the total is divided by 10 at the end
    # index 0 is SVM, 1 is Naive Bayes, 2 is Random Forest, 3 is KNN
    class_list = [Metrics_for_Class() for i in range(0, 4)]
    # the categories
    categories = ["Technology", "Football", "Film", "Business", "Politics"]
    # keep information for the ROC plot
    folist = []
    tlist = []
    plist = []
    filist = []
    blist = []

    # split the data into folds
    for train_index, test_index in kf.split(df.Content):
        # only transform here, not fit: refitting would lose the learned vocabulary
        X_train_counts3 = count_vect.transform(np.array(df.Content)[train_index])
        X_train_counts2 = svd.transform(X_train_counts3)
        # same as above, for the test fold
        X_test_counts3 = count_vect.transform(np.array(df.Content)[test_index])
        X_test_counts2 = svd.transform(X_test_counts3)

        # SVM
        if sys.argv[1] == "SVM":
            # print("SVM STARTED")
            place = 0
            # parameters = {'kernel': ('linear', 'rbf')}
            svr = svm.SVC(kernel="linear")
            svr.fit(X_train_counts2, np.array(df.Category)[train_index])
            y_pred = svr.predict(X_test_counts2)
            y_true = np.array(df.Category)[test_index]
            class_list[0].rec += recall_score(y_true, y_pred, average="macro")
            class_list[0].acc += accuracy_score(y_true, y_pred)
            class_list[0].prec += precision_score(y_true, y_pred, average="macro")
            class_list[0].fl_sc += f1_score(y_true, y_pred, average="macro")
        # Naive Bayes
        elif sys.argv[1] == "NAYVE":
            # print("NAYVE_STARTED")
            place = 1
            clf_cv = MultinomialNB().fit(X_train_counts3, np.array(df.Category)[train_index])
            y_pred = clf_cv.predict(X_test_counts3)
            y_true = np.array(df.Category)[test_index]
            class_list[1].rec += recall_score(y_true, y_pred, average="macro")
            class_list[1].acc += accuracy_score(y_true, y_pred)
            class_list[1].prec += precision_score(y_true, y_pred, average="macro")
            class_list[1].fl_sc += f1_score(y_true, y_pred, average="macro")
        # Random Forest
        elif sys.argv[1] == "RANDOM_FOREST":
            # print("RANDOM_FOREST_STARTED")
            place = 2
            clf_rf = RandomForestClassifier(n_estimators=10).fit(X_train_counts2, np.array(df.Category)[train_index])
            y_pred = clf_rf.predict(X_test_counts2)
            y_true = np.array(df.Category)[test_index]
            class_list[2].rec += recall_score(y_true, y_pred, average="macro")
            class_list[2].acc += accuracy_score(y_true, y_pred)
            class_list[2].prec += precision_score(y_true, y_pred, average="macro")
            class_list[2].fl_sc += f1_score(y_true, y_pred, average="macro")
        # KNN
        elif sys.argv[1] == "KNN":
            place = 3
            K = 7
            clf_kn = knn_classifier(K).fit(X_train_counts2, np.array(df.Category)[train_index])
            y_pred = clf_kn.predict(X_test_counts2, X_train_counts2, K)
            y_true = np.array(df.Category)[test_index]
            class_list[3].rec += recall_score(y_true, y_pred, average="macro")
            class_list[3].acc += accuracy_score(y_true, y_pred)
            class_list[3].prec += precision_score(y_true, y_pred, average="macro")
            class_list[3].fl_sc += f1_score(y_true, y_pred, average="macro")

    # compute the averages over the 10 folds
    class_list[place].rec = float(class_list[place].rec) / 10
    class_list[place].acc = float(class_list[place].acc) / 10
    class_list[place].prec = float(class_list[place].prec) / 10
    class_list[place].fl_sc = float(class_list[place].fl_sc) / 10
    # class_list[place].roc_auc = float(class_list[place].roc_auc) / 10

    # write the results to a CSV
    f = open("EvaluationMetric_" + sys.argv[1] + ".csv", "w")
    f.write("Statistic_Metrics\t")
    if sys.argv[1] == "SVM":
        f.write("SVM")
    elif sys.argv[1] == "NAYVE":
        f.write("Naive Bayes")
    elif sys.argv[1] == "RANDOM_FOREST":
        f.write("Random Forest")
    elif sys.argv[1] == "KNN":
        f.write("KNN")
    f.write("\n")
    f.write("Accuracy\t")
    f.write(str(class_list[place].acc) + "\n")
    f.write("Precision\t")
    f.write(str(class_list[place].prec) + "\n")
    f.write("Recall\t")
    f.write(str(class_list[place].rec) + "\n")
    f.write("F_Measure\t")
    f.write(str(class_list[place].fl_sc) + "\n")
    f.close()
            color='#eeeeee', zorder=1)
    ax.set_xlabel(xlabel, labelpad=20, weight='bold', size=12)
    ax.set_ylabel(ylabel, labelpad=20, weight='bold', size=12)
    ax.set_title(title)
    ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))


# Run when you want to shuffle and remake the CSV file.
# load_slice_dataframes()

df_kaggle_reviews = pd.read_csv('.\labeled_data.csv')
df_kaggle_reviews['reviews_stem'] = get_stemmed_text(
    preprocess_reviews(df_kaggle_reviews.reviews))

my_stop_words = get_stemmed_text(ENGLISH_STOP_WORDS.union(('and', 'or', 'if')))
# my_pattern = r'\b[^\d\W][^\d\W]+\b'  # token_pattern=

# TF-IDF: frequency of the words, i.e. how many times each word appears in the string
# vect_Tfid = TfidfVectorizer(ngram_range=(1, 2), max_features=100,
#                             stop_words=my_stop_words).fit(df_kaggle_reviews['reviews_stem'])
# X_txt_Tfid = vect_Tfid.transform(df_kaggle_reviews['reviews_stem'])
# df_Tfid = pd.DataFrame(X_txt_Tfid.toarray(), columns=vect_Tfid.get_feature_names())

# min_df (= 5): defines the minimum frequency of a word for it to be counted as a feature
vect_BOW = CountVectorizer(ngram_range=(1, 2), stop_words=my_stop_words,
                           max_features=2000, binary=True).fit(df_kaggle_reviews.reviews_stem)
X_txt_BOW = vect_BOW.transform(df_kaggle_reviews.reviews_stem)
###############################################################################
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target

###############################################################################
# Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines', 'organization',
    'writes', 'msg', 'article', 'university', 'does', 'posting', 'thanks',
    'don', 'know', 'help', 'use', 'copy'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
# Stem the stop words so they match the stemmed tokens
stop_words = set([stemmer.stem(word) for word in stop_words])

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    # [stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:
        # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
def __init__(self):
    # Build a list of stop words that I don't want to use as features.
    # These are often '.' but maybe other ones down the road.
    my_stop_words = ['.', '(', ')', ' ', ' .', '..', ').', ' )', ' , ', ' ,']
    stop_words = ENGLISH_STOP_WORDS.union(my_stop_words)
    self.vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7),
                                      stop_words=stop_words,
                                      min_df=1, max_df=1.0)
with open("additional_stopwords.txt", "r") as textfile: additional_stopwords = textfile.read().split('\n') stopwords_list = list(string.punctuation) stopwords_list += ['....', '...', '..', '.....', 'im'] stopwords_list += stopwords.words('english') # nltk stopwords_list += get_stop_words('en') # stop words stopwords_list = list(set(stopwords_list)) stopwords_list = [w for w in stopwords_list if w not in 'not'] print('Length of standard stopwords list: {}'.format(len(stopwords_list))) ext_stopwords_list = stopwords_list ext_stopwords_list += spacy.lang.en.stop_words.STOP_WORDS # spacy ext_stopwords_list += additional_stopwords ext_stopwords_list = ENGLISH_STOP_WORDS.union( stopwords_list) # sklearn stopwords ext_stopwords_list = list(set(ext_stopwords_list)) ext_stopwords_list = [w for w in ext_stopwords_list if w not in 'not'] print('Length of extended stopwords list: {}'.format(len(ext_stopwords_list))) with open('vocab_20k.txt', 'r', encoding="utf8") as f: extended_vocab_20k = f.read().splitlines() with open('contractions.txt', 'r') as f: contractions = eval(f.read()) def find_review_errors(df): """ List unique identity key for missing and duplicate reviews,
def main():
    # ------------------------------DATA----------------------------------
    train_data = pd.read_csv('train_set.csv', sep="\t")
    test_data = pd.read_csv('test_set.csv', sep="\t")
    train_data.drop('RowNum', axis=1)  # ignore RowNum
    test_data.drop('RowNum', axis=1)

    # ------------------------------Processing----------------------------
    extra_words = ["said", "say", "seen", "come", "end", "came", "year", "years", "new", "saying"]  # extra stopwords
    stopwords = ENGLISH_STOP_WORDS.union(extra_words)
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stopwords)    # convert to tf-idf
    tsvd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)  # set dimensions

    set(train_data['Category'])                     # check categories
    le = preprocessing.LabelEncoder()               # set labels
    le.fit(train_data["Category"])                  # fit them to the number of our categories
    y_train = le.transform(train_data["Category"])  # transform categories
    set(y_train)

    count_vectorizer = CountVectorizer(stop_words=stopwords)              # set stopwords for the vectorizer
    X_trainNoLSI = count_vectorizer.fit_transform(train_data['Content'])  # vectorize our data
    tsvd.fit(X_trainNoLSI)                                                # truncate data
    X_train = tsvd.transform(X_trainNoLSI)                                # store them
    test_noLSI = count_vectorizer.transform(test_data['Content'])         # test data
    test = tsvd.transform(test_noLSI)

    k_fold = KFold(n_splits=10)  # 10-fold validation

    # --------------------------------SVM---------------------------------
    clf = svm.SVC(kernel='rbf', C=100, gamma='auto')  # algorithm for application
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)
    # --------------------------------SVM_scores--------------------------
    print "SVM scores:"
    SVMprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    svm_prec = SVMprecs.mean()
    print "precision:", svm_prec
    SVMrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    svm_rec = SVMrecs.mean()
    print "recall:", svm_rec
    SVMfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    svm_fm = SVMfms.mean()
    print "F-measure:", svm_fm
    SVMaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    svm_acc = SVMaccs.mean()
    print "accuracy:", svm_acc

    # ---------------------------------RF---------------------------------
    clf = RandomForestClassifier(max_depth=6, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)
    # ---------------------------------RF_scores--------------------------
    print "RF scores:"
    RFprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    rf_prec = RFprecs.mean()
    print "precision:", rf_prec
    RFrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    rf_rec = RFrecs.mean()
    print "recall:", rf_rec
    RFfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    rf_fm = RFfms.mean()
    print "F-measure:", rf_fm
    RFaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    rf_acc = RFaccs.mean()
    print "accuracy:", rf_acc

    # ----------------------------------MNB--------------------------------
    clf = MultinomialNB()
    clf.fit(X_trainNoLSI, y_train)
    y_pred = clf.predict(test_noLSI)
    # ----------------------------------MNB_scores-------------------------
    print "MNB scores:"
    MNBprecs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='precision_micro')
    mnb_prec = MNBprecs.mean()
    print "precision:", mnb_prec
    MNBrecs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='recall_micro')
    mnb_rec = MNBrecs.mean()
    print "recall:", mnb_rec
    MNBfms = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='f1_micro')
    mnb_fm = MNBfms.mean()
    print "F-measure:", mnb_fm
    MNBaccs = cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='accuracy')
    mnb_acc = MNBaccs.mean()
    print "accuracy:", mnb_acc

    # -----------------------------------K-Nearest_Neighbor------------------
    clf = knn.myKNN(10)  # K=10, check knn_functions.py (imported)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(test)
    # ---------------------------------KNN_scores--------------------------
    print "KNN scores:"
    KNNprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    knn_prec = KNNprecs.mean()
    print "precision:", knn_prec
    KNNrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    knn_rec = KNNrecs.mean()
    print "recall:", knn_rec
    KNNfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    knn_fm = KNNfms.mean()
    print "F-measure:", knn_fm
    KNNaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    knn_acc = KNNaccs.mean()
    print "accuracy:", knn_acc

    # ----------------------------------------------------------------------
    # My Method
    # ----------------------------------------------------------------------
    # remove punctuation from the data
    test_data['Content'] = test_data['Content'].str.replace('[^\w\s]', '')
    train_data['Content'] = train_data['Content'].str.replace('[^\w\s]', '')
    # convert multiple spaces to one
    test_data['Content'] = test_data['Content'].str.replace('\s+', ' ')
    train_data['Content'] = train_data['Content'].str.replace('\s+', ' ')

    # same process as before
    set(train_data['Category'])
    le = preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y_train = le.transform(train_data["Category"])
    set(y_train)
    X_train = count_vectorizer.fit_transform(train_data['Content'])
    test = count_vectorizer.transform(test_data['Content'])

    # usage of MNB: search for the best smoothing parameter (alpha)
    max = 0.0
    maxi = 0.0
    i = 0.01
    while i < 1.0:
        clf = MultinomialNB(alpha=i)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(test)
        myprecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
        my_prec = myprecs.mean()
        if my_prec > max:
            max = my_prec
            maxi = i
        i += 0.01

    print "My Method scores:"
    clf = MultinomialNB(alpha=maxi, fit_prior=True)
    clf.fit(X_train, y_train)
    the_pred = clf.predict(test)
    print "precision:", max
    myrecs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    my_rec = myrecs.mean()
    print "recall:", my_rec
    myfms = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    my_fm = myfms.mean()
    print "F-measure:", my_fm
    myaccs = cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    my_acc = myaccs.mean()
    print "accuracy:", my_acc

    # ------------------------------------CSV---------------------------------
    # my-method predictions csv
    output = 'testSet_categories.csv'
    predicted = le.inverse_transform(the_pred)
    testingfile = pd.DataFrame({'ID': test_data['Id'], 'Predicted_Category': list(predicted)},
                               columns=['ID', 'Predicted_Category'])
    testingfile.to_csv(output, encoding='utf-8', index=False, sep='\t')

    # results csv
    output = 'EvaluationMetric_10fold.csv'
    d = {'StatisticMeasure': ['Accuracy', 'Precision', 'Recall', 'F-Measure'],
         'Naive Bayes': [mnb_acc, mnb_prec, mnb_rec, mnb_fm],
         'Random Forest': [rf_acc, rf_prec, rf_rec, rf_fm],
         'SVM': [svm_acc, svm_prec, svm_rec, svm_fm],
         'KNN': [knn_acc, knn_prec, knn_rec, knn_fm],
         'My Method': [my_acc, max, my_rec, my_fm]}
    df = pd.DataFrame(data=d, columns=['StatisticMeasure', 'Naive Bayes', 'Random Forest', 'SVM', 'KNN', 'My Method'])
    df.to_csv(output, encoding='utf-8', index=False, sep='|')
    # Lower text and use translation table to remove all punctuation and digits
    text = text.lower().translate(t_table)

    # Best Stemmer for this dataset (Tested)
    stemmer = PorterStemmer()
    # stemmer = SnowballStemmer("english")
    # stemmer = LancasterStemmer()

    stems = [stemmer.stem(word.strip()) for word in text.split()]
    return stems


print('Creating stop words (NLTK & SKLEARN) ...')
# 153 stop words from NLTK
nltk_stop_words = stopwords.words('english')
# Combine stop words from all the stop word lists
stop_words = ENGLISH_STOP_WORDS.union(nltk_stop_words)

ngram = 2
min_df = 5

# Using idf
print('[PorterStemmer] Converting text documents to numerical feature vectors.... aka vectorizing...')
print('Ngrams = %i' % ngram)
print('min_df = %i' % min_df)
tfidf_vec = TfidfVectorizer(tokenizer=tokenizer, norm='l2', ngram_range=(1, ngram),
                            sublinear_tf=True, min_df=min_df, stop_words=stop_words)

# Fit the vectorizer on the combined train/test abstract data
tfidf_vec.fit(abstract_df.values)

# Transform training and test data set to numerical feature vectors
X_train_tfidf = tfidf_vec.transform(train_df['Abstract'].values)
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []

# stopwords_set = set(stopwords.words('english'))
# todo: for this dataset, an empty stopwords list gives the highest CV score
# stopwords_set = set()
# todo: but this set, with the negative word-parts removed, scores higher than the nltk 'english' list
# todo: the dataset is small though, and the empty stopwords set is only correct for 1 or 2 more reviews,
# todo: so it is probably better to use the general approach with common stopwords;
# todo: the negative word parts should be kept because ngrams of 2 words are used
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords_set = ENGLISH_STOP_WORDS.difference(
    ['not', 'no', 'nor', 'none', 'never', 'nothing', 'very'])

stemmer = PorterStemmer()
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords_set]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500, ngram_range=(1, 2))
## Ingest ML papers data ##
###########################
data_fname = '../../data/papers.csv'
data = pd.read_csv(data_fname)
data.dropna(subset=['full_text'], inplace=True)

# %%
####################################################################
## Ingest and incorporate custom data science specific stop words ##
####################################################################
stopwords_fname = '../../data/ml_stopwords.csv'
add_stop_words = pd.read_csv(stopwords_fname)
new_stop_word_list = ENGLISH_STOP_WORDS.union(add_stop_words.Stopword.values)

# %%
###############################################
## Ingest custom keyword lists for ML topics ##
###############################################
# Load all custom keyword lists for ML topics to use as topic priors in LDA
topic_priors_dir = os.fsencode('../../data/topic_priors/')
topic_priors_df_list = []
base_weight = 100
for f in os.listdir(topic_priors_dir):
    fname = '../../data/topic_priors/' + os.fsdecode(f)
    topic_name = os.fsdecode(f).split('.')[0]
    topic_words = pd.read_csv(open(fname))
def get_random_class_labels(num=8):
    return np.random.choice(class_labels_all, num, replace=False)


# ========================= STOP WORDS ========================= #
useless_words = set([
    'postgres', 'big', 'panda', 'using', 'scikit', 'sklearn', 'apache', 'spark',
    'lambda', 's3', 'does', 'looking', 'help', 'new', 'data', 'science',
    'scientist', 'machine', 'learning', 'use', 'need', 'engineer', 'engineering'
])

custom_stop_words = ENGLISH_STOP_WORDS.union(useless_words).union(set(class_labels_all))


def load_sqlite(database, query=None, class_labels=None):
    try:
        connection = sqlite3.connect(database)
    except Exception as e:
        print(f"The error '{e}' occurred connecting")

    placeholders = ','.join('?' for label in class_labels)

    ### FIX ###
    # this query needs to be explicitly given in each notebook
    # to allow for different databases
    subreddit_query = """
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from make_csv import *

emails = pd.read_csv('email_dataset.csv')
dataframe = pd.DataFrame(parse_into_emails(emails.message))
dataframe.drop(dataframe.query("body == '' | to == '' | from_ == ''").index, inplace=True)

stopwords = ENGLISH_STOP_WORDS.union(['hou', 'com', 'recipient'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.6, min_df=2)
X = vect.fit_transform(dataframe.body)
features = vect.get_feature_names()

n_clusters = 3
clf = KMeans(n_clusters=n_clusters, max_iter=100, init='k-means++', n_init=1)
labels = clf.fit_predict(X)  # cluster the tf-idf matrix, not the raw CSV

clusters = {}
n = 0
for item in labels:
    if item in clusters:
        clusters[item].append(row_dict[n])
    else:
        clusters[item] = [row_dict[n]]
print(len(have_cancel), 'records have "cancel*" in them')
canceled_cats = Counter([i['category'] for i in have_cancel])
sorted(canceled_cats.items(), key=itemgetter(1), reverse=True)[0:10]

# # Set up the vectorisers and classifiers
# The per-record text data is fairly sparse and the vocabulary is quite big overall,
# so it's worth trying different vectorisers.

# In[102]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as ESW
ESW = ESW.union({'cancelled', 'canceled'})
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.cross_validation import cross_val_score, KFold


def my_tokeniser(string):
    '''
    This can be changed to result in more sophisticated word detection.
    For now, it just splits up into alpha-only chunks, strips numbers.
    Preserves hyphenated and apostrophed words but ignores other punct.
    Gets rid of single-char stuff.
    '''
    pattern = re.compile("[A-Za-z0-9\-']*[^\W]")
    return [i for i in re.findall(pattern, string)
            if i.isnumeric() == False and len(i) > 1]
# Code used in part 2 of "How I used machine learning to classify emails and turn them into insights".
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from helpers import parse_into_emails
from query import EmailDataset

# Just like in part_1, read and preprocess the emails.
emails = pd.read_csv('split_emails.csv')
email_df = pd.DataFrame(parse_into_emails(emails.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
vec_train = vec.fit_transform(email_df.body)

# print out the vector of the first email
# print(vec_train[0:1])

# Find cosine similarity between the first email and all others.
cosine_sim = linear_kernel(vec_train[0:1], vec_train).flatten()
# print out the cosine similarities
# print(cosine_sim)

# Finding emails related to a query.
query = "john"
# Transform the query into the original vector space.
vec_query = vec.transform([query])
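# The snippet above stops right after vectorizing the query. One way to finish
# the thought (a sketch, not necessarily how the original article continues) is
# to score every email against the query and pick the closest matches:
cosine_sim_query = linear_kernel(vec_query, vec_train).flatten()
# Indices of the 10 most similar email bodies, best match first.
related_email_indices = cosine_sim_query.argsort()[:-11:-1]
# print(email_df.body.iloc[related_email_indices[0]])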
# Import the vectorizer and the default English stop words list
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Define the stop words
my_stop_words = ENGLISH_STOP_WORDS.union(['airline', 'airlines', '@', 'am', 'pm'])

# Build and fit the vectorizers
vect1 = CountVectorizer(stop_words=my_stop_words)
vect2 = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
vect1.fit(tweets.text)
vect2.fit(tweets.negative_reason)

# Print the last 15 features from the first vectorizer, and all features from the second
print(vect1.get_feature_names()[-15:])
print(vect2.get_feature_names())
input2 = np.column_stack((input_texts.reshape(-1, 1), target_texts.reshape(-1, 1)))
df = pd.DataFrame(input2, columns=['Body', 'target'])
mail_df = df.copy()
# mail_df.drop(emails.query(
#     "Body == '' | To == '' | 'Sender Email' == ''"
# ).index, inplace=True)
mail_df = mail_df[mail_df['Body'].isnull() == False]

'''
no stop words
vect = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=2)
X = vect.fit_transform(mail_df.Body)
'''
stopwords = ENGLISH_STOP_WORDS.union(
    ['ect', 'hou', 'com', 'recipient', 'tom', 'mary', 'don'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
X = vect.fit_transform(mail_df.Body)


def pca_scatter():
    X_dense = X.todense()
    coords = PCA(n_components=2).fit_transform(X_dense)
    plt.scatter(coords[:, 0], coords[:, 1], c='m')
    plt.show()


def top_tfidf_feats(row, features, top_n=20):
    topn_ids = np.argsort(row)[::-1][:top_n]
def main():
    # combine the wordcloud STOPWORDS, sklearn's list, and the extra words
    stop_words = set(STOPWORDS)
    stop_words.update(ENGLISH_STOP_WORDS)
    # extra stop words
    extra_words = ["said", "say", "seen", "come", "end", "came", "year", "years", "new", "saying"]
    stop_words = stop_words.union(ENGLISH_STOP_WORDS.union(extra_words))

    df = pd.read_csv('/kaggle/input/question1/train.csv')
    # df = df.head(n=1000)

    cat_business = []
    cat_entertainment = []
    cat_health = []
    cat_technology = []

    # store the content for each category
    for index in range(len(df.Label)):
        cat = df.Label[index]
        if cat == "Business":
            cat_business.append(df.Content[index])
        elif cat == "Entertainment":
            cat_entertainment.append(df.Content[index])
        elif cat == "Health":
            cat_health.append(df.Content[index])
        elif cat == "Technology":
            cat_technology.append(df.Content[index])

    str_bus = ''.join(cat_business)
    str_ent = ''.join(cat_entertainment)
    str_hea = ''.join(cat_health)
    str_tec = ''.join(cat_technology)

    # produce a word cloud for each category
    cloud = WordCloud(stopwords=stop_words)

    w = cloud.generate(str_bus)
    plt.figure()
    plt.imshow(w)
    plt.title("Business")
    plt.axis("off")
    plt.savefig('/kaggle/working/Business.png')

    w = cloud.generate(str_ent)
    plt.figure()
    plt.imshow(w)
    plt.title("Entertainment")
    plt.axis("off")
    plt.savefig('/kaggle/working/Entertainment.png')

    w = cloud.generate(str_hea)
    plt.figure()
    plt.title("Health")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('/kaggle/working/Health.png')

    w = cloud.generate(str_tec)
    plt.figure()
    plt.imshow(w)
    plt.title("Technology")
    plt.axis("off")
    plt.savefig('/kaggle/working/Technology.png')
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# vect = CountVectorizer(tokenizer=LemmaTokenizer())
# print('test')  # it was working correctly up to here

my_stopwords = ENGLISH_STOP_WORDS.union(['@', '<br />'])
# porter = PorterStemmer()
vect = CountVectorizer(max_features=1000, ngram_range=(1, 3),
                       stop_words=my_stopwords, tokenizer=LemmaTokenizer())

# vect.fit(df_train.text)
X = vect.fit_transform(df_train.text)
X_test = vect.transform(df_test.text)  # reuse the fitted vocabulary for the test set

# Transform to an array
my_array = X.toarray()
my_array_test = X_test.toarray()

# Transform back to a dataframe, assign column names
X_df = pd.DataFrame(my_array, columns=vect.get_feature_names())
globalDataNum = 100

train_data = pd.read_csv('train_set.csv', sep="\t")
test_data = pd.read_csv('test_set.csv', sep="\t")
train_data = train_data[0:globalDataNum]
test_data = test_data[0:globalDataNum]

categories = train_data.Category
ids = train_data.Id
compons = [2, 3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
myData = train_data['Content'] + 5 * train_data['Title']

# for question #4 - initialization
metrics_all = [[0 for x in range(5)] for y in range(4)]

# adding English stopwords
eng_stop_words = ENGLISH_STOP_WORDS
myStopWords = {'yes', 'just', "don't", 'didn'}
eng_stop_words = ENGLISH_STOP_WORDS.union(myStopWords)

# set(categories)
le = preprocessing.LabelEncoder()
le.fit(categories)
y = le.transform(categories)
set(y)
set(le.inverse_transform(y))

count_vectorizer = CountVectorizer(stop_words=eng_stop_words)
# count_vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2,
#                                    stop_words=eng_stop_words, use_idf=True)
X = count_vectorizer.fit_transform(myData)

"""
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import json
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

with open('extraStopWords.json', 'r') as extraStopWords:
    extraStopWords = json.load(extraStopWords)

stopWords = ENGLISH_STOP_WORDS.union(extraStopWords)
categories = ['Politics', 'Film', 'Football', 'Business', 'Technology']
df = pd.read_csv('./datasets/train_set.csv', sep='\t')

for category in categories:
    print("Creating word cloud for: " + category + ".")
    c_df = df[(df['Category'] == category)]
    content = ' '.join(c_df['Title'].iloc[i] + ' ' + c_df['Content'].iloc[i]
                       for i in range(len(c_df)))
    wordcloud = WordCloud(background_color="white", stopwords=stopWords).generate(content)
    plt.imsave('WordCloud_For:_' + category + '_.png', wordcloud)

print("Done!")
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cluster import KMeans
# import matplotlib.pyplot as plt

# pd.options.display.max_columns = 30

stop_words = ENGLISH_STOP_WORDS.union({
    'king', 'german', 'brau', 'james', 'brewery', 'company', 'brewing', 'house',
    'bock', 'style', 'scotch', 'california', 'oktoberfest', 'wee', 'special',
    'english', 'american', 'hefeweizen', 'old', 'common', 'gose'})

scaler = StandardScaler()


class TopicModeler(object):
    """
    Topic Modeler
    ---------
    model : type of sklearn model to use (currently set up for LDA but could be
        extended to NMF and others)
    vectorizer : sklearn vectorizer (currently set up for CountVectorizer but
        could be extended to use TfIDF)
    distance_func : sklearn.pairwise function for determining distance between documents
    """

    def __init__(self, model, vectorizer, distance_func=cosine_distances):
        self.model = model
        self.text = None
        self.names = None
        continue
    for spam_file in files:
        print('file: {}'.format(spam_file))
        spam_df = pd.read_csv(os.path.join(subdir, spam_file), encoding='latin-1')
        spam_df.dropna(inplace=True)
        spam_df['label'] = 1
        df = pd.concat([spam_df, ham_df])
        del spam_df
        df = df.sample(frac=1)

        additional_stop_words = ['enron', 'vince', 'louise', 'attached',
                                 '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
                                 '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
        additional_stop_words += ['koi8', 'http', 'windows', 'utf', 'nbsp', 'bruceg']

        more_words_file = open(os.path.join(os.getcwd(), 'Results', 'features', 'removed_features_list.dmp'))
        more_words = more_words_file.readlines()
        more_words_file.close()
        additional_stop_words += more_words

        start_time = time.time()
        # vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words), ngram_range=(1, 2))
        vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words))
        y_train = df['label']
        X_train = vectorizer.fit_transform(df['content'])
        del df

        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        print('Training time = {}'.format(time.time() - start_time))
        del X_train, y_train

        # print('Informative features')
        show_most_informative_features(vectorizer, classifier, spam_file)
import re
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split
from debate_cleaning import *
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.max_columns = 40

stop_words = ENGLISH_STOP_WORDS.union({
    'redirects', 'mentions', 'locations', 've', 'know', 'don', 'way', 'think',
    'going', 'just', 'said', 'got', 'like', 'need', 'say', 'll', 'america',
    'want', 'sure', 'make', 'come', 'right', 'let', 'did', 'look', 'actually',
    'lot', 'does', 'people', 'fact', 'time', 'president', 'country', 'united',
    'states', 'american', 'trump'})


class TopicModeler(object):

    def __init__(self, model, vectorizer, distance_func=cosine_distances):
        self.model = model
        self.text = None
        self.titles = None
        self.vectorizer = vectorizer
        self.feature_names = None
        self.doc_probs = None
        self.distance_func = distance_func
        self.word_vec = None

    def set_feature_names(self):
    b = get_article(records[num])
    val.append(b)

df = build_df(val)
# TODO: stem/lemmatize

stop_words = set()
stop_words = stop_words.union({'house', 'just', 'like', 'did', 'time', 'saw', 'right', 'left',
                               'road', 'county', 'year', 'road', 'said', 'area', 'nt', 'woods',
                               'heard', '2009', '2012', '2011', '2013', '2009', 'km', '07', '09',
                               'didnt', 'got', 'went', 'know'})
stop_words = stop_words.union(set(df['year']))
stop_words = stop_words.union(set(df['season']))
stop_words = stop_words.union(set(df['month']))
stop_words = stop_words.union(set(df['state']))
stop_words = stop_words.union(set(df['county']))
stop_words = ENGLISH_STOP_WORDS.union(stop_words)

lemmer = WordNetLemmatizer()
# tokenized = [word_tokenize(content.lower()) for content in corpus]
# docs = [[word for word in words if word not in stop_words] for words in tokenized]
# corp_lem = [wordnet.lemmatize(word) for word in word_tokenize(corpus.lower())]

n_corp = []
for i in corpus:
    n_corp.append(re.sub('[^A-Za-z0-9]+', ' ', i).lower())

corp = []
for i in n_corp:
    corp.append(lemmer.lemmatize(i))

vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(corpus)
def _remove_stopwords(self):
    """Remove stop words from the extracted key terms."""
    stop_words = sklearn_stopwords.union(nltk_stopwords.words('english'))
    self.terms = self.terms[~(self.terms['name'].isin(stop_words))]
    self.paper_term = self.paper_term[self.paper_term['term_id'].isin(self.terms.index)]
skip_files = ['2012_spam.csv', '2013_spam.csv', 'ham.csv', 'ham_latin.csv']

for subdir, dirs, files in os.walk(yearly_spam_folder):
    if dirs != []:
        continue
    for spam_file in files:
        if spam_file not in skip_files:
            eprint('file: {}'.format(spam_file))
            spam_df = pd.read_csv(os.path.join(subdir, spam_file), encoding='latin-1')
            spam_df.dropna(inplace=True)
            spam_df['label'] = 1
            df = pd.concat([spam_df, ham_df])
            del spam_df
            df = df.sample(frac=1)

            additional_stop_words = ['enron', 'vince', 'louise', 'attached',
                                     '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
                                     '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
            additional_stop_words += ['koi8', 'http', 'windows', 'utf', 'nbsp', 'bruceg']

            start_time = time.time()
            vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words),
                                         ngram_range=(1, 2))
            y_train = df['label']
            X_train = vectorizer.fit_transform(df['content'])
            del df

            classifier = LogisticRegression()
            classifier.fit(X_train, y_train)
            eprint('Training time = {}'.format(time.time() - start_time))
            del X_train, y_train

            # print('Informative features')
            show_most_informative_features(vectorizer, classifier, spam_file)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from time import time
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np

# add some basic stopwords
stopwords = set()
my_words = ["said", "say", "says"]
stopwords = ENGLISH_STOP_WORDS.union(my_words)

print "RandomForests..."

# load the datasets
train_data = pd.read_csv('./datasets/train_set.csv', sep="\t")
X = train_data["Content"]

# Transform Category from strings to numbers from 0-4
le = preprocessing.LabelEncoder()
le.fit(train_data["Category"])
y = le.transform(train_data["Category"])

# Split the train set by preserving the percentage of samples for each class.
n_folds = 10
folds = StratifiedKFold(n_splits=n_folds)
def crossValidation(self, model_tag):
    scores = []
    f1_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    additional_stop_words = ['https', 'http', 'amp', 'com', 'reddit', 'www']

    for train_index, test_index in skf.split(self.train_x, self.train_y):
        if model_tag == 0:
            model = Pipeline([
                ("ngram", CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words),
                                          ngram_range=(1, 3), min_df=2, max_features=1000)),
                ("tfidf", TfidfTransformer()),
                ("clf", MultinomialNB())
            ])
        elif model_tag == 1:
            model = Pipeline([
                ("ngram", CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words),
                                          ngram_range=(1, 3), min_df=2, max_features=1000)),
                ("tfidf", TfidfTransformer()),
                ("clf", SVC(C=1.0, gamma='scale', kernel='linear'))
            ])
        elif model_tag == 2:
            model = Pipeline([
                ("ngram", CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words),
                                          ngram_range=(1, 3), min_df=2, max_features=1000)),
                ("tfidf", TfidfTransformer()),
                ("clf", AdaBoostClassifier())
            ])
        elif model_tag == 3:
            model = Pipeline([
                ("ngram", CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words),
                                          ngram_range=(1, 3), min_df=2, max_features=1000)),
                ("tfidf", TfidfTransformer()),
                ("clf", SGDClassifier(loss='hinge', penalty='l1', alpha=1e-3, max_iter=1000, tol=1e-3))
            ])
            # SVC(C=1.0, kernel='linear', gamma='auto')
            # LinearSVC(penalty='l1', dual=False, max_iter=1000)
            # SVC(C=1.0, kernel='sigmoid', gamma='scale')

        model.fit(self.train_x[train_index], self.train_y[train_index])
        predicted = model.predict(self.train_x[test_index])
        scores.append(accuracy_score(predicted, self.train_y[test_index]))
        f1_scores.append(f1_score(self.train_y[test_index], predicted))

        # Start: This part is used for error analysis
        # count = 0
        # error_text = []
        # for k in range(len(predicted)):
        #     if predicted[k] != self.train_y[test_index][k]:
        #         count += 1
        #         error_text.append((k, self.train_x[test_index][k]))
        #     if count == 10:
        #         break
        #
        # for error in error_text:
        #     print(error)
        # End

        # Start: This part is used for top-20 feature output
        # self.print_top20(model.named_steps['ngram'], model.named_steps['clf'])
        features_matrix = model.named_steps['ngram'].fit_transform(
            self.train_x[train_index], self.train_y[train_index])
        top20_best = SelectKBest(chi2, k=20)
        top20_best.fit_transform(features_matrix, self.train_y[train_index])
        feature_names = model.named_steps['ngram'].get_feature_names()
        top20_index = [i for i, x in enumerate(top20_best.get_support()) if x]
        # print("Top 20 features : %s" % (", ".join(feature_names[i] for i in top20_index)))
        # End

    print("\nF1-measure: ", mean(f1_scores))
    return (scores)
def removeStopwordsNLTKSklearn(document):
    # dumb wrapper for just showing the union method
    stopwords = sklearn_stop_words.union(nltk.corpus.stopwords.words('english'))
    return removeStopwords(stopwords, document)
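# removeStopwords() used above is defined elsewhere in that project. A minimal
# sketch of what such a helper might look like (the name, signature and
# behaviour are assumptions, not the original code):
def removeStopwords(stopwords, document):
    """Drop every whitespace-separated token that appears in the stop-word set."""
    return ' '.join(word for word in document.split() if word.lower() not in stopwords)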
def get_stop_words(self):
    f = open('names.txt')
    a = f.read()
    b = [x.strip() for x in a.split(',')]
    stop_words = ENGLISH_STOP_WORDS.union(b)
    return stop_words
"%22&page=1" SUS_DEV_URL = "https://www.federalregister.gov/api/v1/documents.json?conditions%5Bagencies%" \ "5D%5B%5D=commerce-department&conditions%5Bagencies%5D%5B%5D=defense-department" \ "&conditions%5Bagencies%5D%5B%5D=national-aeronautics-and-space-administration" \ "&conditions%5Bagencies%5D%5B%5D=health-and-human-services-department" \ "&conditions%5Bagencies%5D%5B%5D=transportation-department" \ "&conditions%5Bterm%5D=%22sustainable+development%22&page=1" IOT_PATH = "files/data/iot.pkl" SUS_DEV_PATH = "files/data/sus_dev.pkl" # LDADE TOKEN_PATTERN = re.compile(r"(?u)\b[a-zA-Z]{2}[a-zA-Z]+\b") ITERATIONS = 100 ALPHA = None BETA = None STOP_WORDS = ENGLISH_STOP_WORDS.union(['software', 'engineering']) N_TOPICS = 10 RANDOM_STATE = 1 AGENCY_MAP = { 'Transportation Department': 'DOT', 'Federal Transit Administration': 'FTA', 'Commerce Department': 'DOC', 'International Trade Administration': 'ITA', 'Economic Development Administration': 'EDA', 'National Oceanic and Atmospheric Administration': 'NOAA', 'Federal Highway Administration': 'FHWA', 'Interior Department': 'DOI', 'Fish and Wildlife Service': 'FWS', 'Defense Department': 'DOD', 'Navy Department': 'USN', 'Health and Human Services Department': 'HHS',
def add_stop_words(self):
    if self.stop_words is not None:
        words = self._split_on_spaces(self.stop_words)
        self.stop_words = ENGLISH_STOP_WORDS.union(words)