# Ridge regression on job-salary data: TF-IDF on the description text plus
# one-hot encoded categorical columns.
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

df = pd.read_csv('salary-train.csv')
df_test = pd.read_csv('salary-test-mini.csv')
target = df['SalaryNormalized']

# Lower-case the descriptions and replace everything except letters/digits with spaces.
df['FullDescription'] = df['FullDescription'].str.lower()
df['FullDescription'] = df['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex=True)
df_test['FullDescription'] = df_test['FullDescription'].str.lower()
df_test['FullDescription'] = df_test['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex=True)

df = df.drop(['SalaryNormalized'], axis=1)
df_test = df_test.drop(['SalaryNormalized'], axis=1)

# TF-IDF features from the free-text description (keep words seen in at least 5 documents).
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(df['FullDescription'])
X_test = vectorizer.transform(df_test['FullDescription'])

# Fill missing categorical values with a placeholder, then one-hot encode.
enc = DictVectorizer()
df['ContractTime'] = df['ContractTime'].fillna('sad23')
df_test['ContractTime'] = df_test['ContractTime'].fillna('sad23')
X_train_categ = enc.fit_transform(df[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = enc.transform(df_test[['LocationNormalized', 'ContractTime']].to_dict('records'))

# Stack the sparse text features and categorical features side by side.
matr = hstack([X, X_train_categ])
matr1 = hstack([X_test, X_test_categ])
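# The snippet imports Ridge but never fits it; a minimal sketch of the likely
# next step, training on the stacked train matrix and predicting on the test
# rows (the alpha value here is an arbitrary assumption, not from the original).
model = Ridge(alpha=1.0)
model.fit(matr, target)
salary_pred = model.predict(matr1)
print(salary_pred[:2])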
# Cluster short captions with TF-IDF + KMeans and inspect the top terms per cluster.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Each line of cap.txt is "<id> <caption text>"; keep only the caption part.
captions = []
with open("cap.txt", encoding="utf8") as caption_file:
    for caption in caption_file:
        captions.append(caption.split(' ', 1)[1])

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(captions)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
# Sort each centroid's weights in descending order to find its dominant terms.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print("\n")

print("Prediction")
Y = vectorizer.transform(["FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148."])
prediction = model.predict(Y)
print("FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148.")
print(prediction)
from sklearn.feature_extraction.text import TfidfVectorizer

def create_word_tf_idf_for_ip(df, whole_df):
    """Word-level TF-IDF: fit the vocabulary on the whole corpus, then transform the given frame."""
    tfidf_vector = TfidfVectorizer(analyzer='word', max_features=5000)
    tfidf_vector.fit(whole_df['text'])
    x_test_tfidf = tfidf_vector.transform(df['text'])
    return x_test_tfidf
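# A minimal usage sketch (the DataFrames here are hypothetical): fit the
# vocabulary on the full corpus, then transform a slice of it.
import pandas as pd
whole_df = pd.DataFrame({'text': ['first document text', 'second document text', 'one more document']})
ip_df = whole_df.iloc[:2]
x_ip_tfidf = create_word_tf_idf_for_ip(ip_df, whole_df)
print(x_ip_tfidf.shape)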
# words_file is defined earlier in the original script; only authors_file appears here.
authors_file = "../text_learning/your_email_authors.pkl"

import pickle
# open the pickles in binary mode for Python 3
word_data = pickle.load(open(words_file, "rb"))
authors = pickle.load(open(authors_file, "rb"))

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn import tree
from sklearn.metrics import accuracy_score
clf = tree.DecisionTreeClassifier()
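### a minimal sketch of the step the comments describe: fit the decision tree
### on the 150-event training slice and report test accuracy (this is an
### assumed completion, not the original solution file)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("accuracy:", accuracy_score(labels_test, pred))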
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vec(corpus):
    """Fit TF-IDF on the corpus and return the train matrix together with the fitted vectorizer."""
    tfidf = TfidfVectorizer()
    train_vec = tfidf.fit_transform(corpus)
    # for test data, reuse the fitted vectorizer, e.g.:
    # tfidf.transform(['ya Allah meri sister Affia ki madad farma', 'khud chahta a is umar main shadi'])
    return train_vec, tfidf
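# A minimal usage sketch with a made-up two-sentence corpus: fit on the training
# text and reuse the returned vectorizer for unseen text.
train_corpus = ['this is the first training sentence', 'and this is the second one']
train_vec, fitted_tfidf = tfidf_vec(train_corpus)
test_vec = fitted_tfidf.transform(['a new unseen sentence'])
print(train_vec.shape, test_vec.shape)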
# train
call_txt_tr = []
for i, row in trainLabels.iterrows():
    call_txt_tr.append(' '.join(
        get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))

# test
call_txt_te = []
for i, row in sampleSubmission.iterrows():
    call_txt_te.append(' '.join(
        get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))

logging.info('-> vectorizing...')
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=10000)
# fit the vocabulary on train + test call strings, then transform each split
vect.fit(call_txt_tr + call_txt_te)
X_call_tr = vect.transform(call_txt_tr)
X_call_te = vect.transform(call_txt_te)

logging.info('-> apply NMF...')
from sklearn.decomposition import NMF
from scipy import sparse
# note: the old `sparseness` argument has been removed from NMF in modern scikit-learn
nmf = NMF(n_components=10)
nmf.fit(sparse.vstack([X_call_tr, X_call_te]))
X_calls_nmf_tr = nmf.transform(X_call_tr)
X_calls_nmf_te = nmf.transform(X_call_te)

# funcs
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    # MultinomialNB needs non-negative features; in current scikit-learn the old
    # non_negative=True flag is replaced by alternate_sign=False
    ('vect', HashingVectorizer(alternate_sign=False)),
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

clfs = [clf_1, clf_2, clf_3]

"""
AMHPC
# if defined (_OPENMP)
pragma omp parallel for
# endif
"""

for clf in clfs:
    evaluate_cross_validation(clf, X_train, y_train, 5)
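# evaluate_cross_validation is not defined in this snippet; a plausible minimal
# sketch using scikit-learn's cross_val_score (the K-fold setup is an assumption).
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

def evaluate_cross_validation(clf, X, y, K):
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print("Mean score: %.3f (+/- %.3f)" % (np.mean(scores), np.std(scores)))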
        # inside sent2word: keep only the words that are not stop words
        if word in stopwords:
            continue
        else:
            newSent.append(word)
    return newSent

'''
word2vec by tfidf
'''
corpus = []
for i in range(len(mydata)):
    corpus.append(' '.join(sent2word(mydata['data'][i])))

vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
tfidf = vectorizer.fit_transform(corpus)
print('the num of doc: %d' % tfidf.shape[0])
print('the size of dict: %d' % tfidf.shape[1])

X = np.array(tfidf.todense())  # expand the sparse matrix into a dense array
y = np.array(mydata['label'])
y_crowd = np.array(mydata['crowd3'])
print(X.shape)

'''
disrupt the order of the data
'''
rng = np.random.RandomState(0)    # random number generator with a fixed seed
indices = np.arange(len(mydata))  # array of row indices
rng.shuffle(indices)              # shuffle the indices in place
X = X[indices[:]]
explained_var.append(pca.explained_variance_ratio_.sum())
if pca.explained_variance_ratio_.sum() > 0.6:
    break

# Refit PCA with a fixed number of components and project both splits with the
# refit estimator (the original transformed with the `pca` from the loop above).
pca_train = PCA(n_components=260)
pca_train.fit(X_train)
pca_test = pca_train.transform(X_test)
pca_train = pca_train.transform(X_train)  # reuse the name for the projected training data

model = KMeans(n_clusters=5, max_iter=100)
clustered = model.fit(pca_train)
labels_pred = model.predict(pca_test)
metrics.fowlkes_mallows_score(y_test, labels_pred)

# Create TF-IDF with no dimensionality reduction
# (the original fit a second vectorizer on the raw token lists; one fit on the
# joined documents is sufficient)
data_Tfidf = pd.Series([' '.join(doc) for doc in data])
vectorizer = TfidfVectorizer()
data_Tfidf = vectorizer.fit_transform(data_Tfidf).toarray()

# Split train test
stratSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_index, test_index in stratSplit.split(data_Tfidf, label):
    X_train, X_test = data_Tfidf[train_index], data_Tfidf[test_index]
    y_train, y_test = label[train_index], label[test_index]

# Kmeans on TF-IDF with no dimensionality reduction
model = KMeans(n_clusters=5, max_iter=100)
clustered = model.fit(X_train)
labels_pred = model.predict(X_test)
def hierarchical_clustering(datasetDir, preprocessing, distance):
    all_data = datasets.load_files(datasetDir, description=None, load_content=True,
                                   encoding='utf-8', shuffle=False)
    prefix = ClusteringDir2 + "\\" + preprocessing + "_" + distance

    """ Apply Tf-idf vectorizer with stop words """
    count_vectorizer = TfidfVectorizer(stop_words='english')
    """ Learn vocabulary and tf-idf, return term-document matrix. """
    X = count_vectorizer.fit_transform(raw_documents=all_data.data).toarray()

    """ Apply dimensionality reduction using truncated SVD (aka LSA). """
    svd = TruncatedSVD(n_components=200)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    data = lsa.fit_transform(X)

    labels = plot_clusters(data, cluster.AgglomerativeClustering, preprocessing, distance, (), {
        'n_clusters': n_clusters,
        'linkage': 'complete',
        'affinity': distance
    })

    # write each cluster's top words and member files
    clustering_file = prefix + '_clustering_result.data'
    clustering_file_handle = open(clustering_file, 'w', encoding='utf-8')
    for index in range(0, 10):
        sub_corpus = []
        corpus_files = []
        for key, label in enumerate(labels):
            if index == label:
                sub_corpus.append(all_data.data[key])
                corpus_files.append(all_data.filenames[key])
        top_n_words, _ = get_top_n_words_n_que(sub_corpus, 10)
        top_n_words = np.array(top_n_words)
        clustering_file_handle.write("cluster %d, label %s\n" % (index, list(top_n_words[:, 0])))
        for file in corpus_files:
            clustering_file_handle.write("\t %s\n" % file)

    # Plot silhouette score for a range of cluster counts
    sil = []
    for n_cluster in range(4, 30):
        model = cluster.AgglomerativeClustering(n_clusters=n_cluster, affinity=distance,
                                                linkage='complete').fit(X)
        labels = model.labels_
        sil.append(silhouette_score(X, labels, metric='euclidean'))
    plt.plot(list(range(4, 30)), sil)
    plt.grid(True)
    plt.savefig(prefix + "_silhouette_score.png")
    plt.close()

    """ Plot the hierarchical clustering as a dendrogram """
    plt.figure(figsize=(10, 7))
    plt.title("Hierarchical Clustering Dendrograms")
    dend = shc.dendrogram(shc.linkage(data, method='complete'))
    plt.savefig(prefix + "_hierarchical clustering_Dendrograms.png")
    plt.close()

    top_n_words, que_n_words = get_top_n_words_n_que(all_data.data, top_word_count)

    """ Output n top frequent words into file """
    top_n_words_file = prefix + "top_n_words.data"
    out_filepath_handle = open(top_n_words_file, "w")
    word_names = []
    word_freqs = []
    reverse_freqs = []
    for word in top_n_words:
        word_names.append(word[0])
        word_freqs.append(word[1])
        reverse_freqs.append(word[1])
        out_filepath_handle.write(str(word) + '\n')
    out_filepath_handle.close()

    """ Visualize the n top frequent words """
    index = np.arange(0, top_word_count)
    reverse_freqs.reverse()
    word_names.reverse()
    plt.barh(index, reverse_freqs)
    plt.yticks(index, word_names)
    plt.title(str(top_word_count) + " top frequent words")
    plt.ylabel("words")
    plt.xlabel("frequency")
    plt.savefig(prefix + "_" + str(top_word_count) + " top frequent words.png")
    plt.close()

    word_freqs = np.array(reverse_freqs)
    cooccurrence_matrix = np.outer(word_freqs, word_freqs)
    ax = sns.heatmap(cooccurrence_matrix, linewidth=0.1)
    plt.yticks(index, word_names, rotation='horizontal')
    plt.xticks(index, word_names, rotation='vertical')
    plt.title("Words Co-occurrence")
    plt.savefig(prefix + "_Co-occurrence.png")
    plt.close()
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
df_val = df_test.head(100)
df_test = df_test.drop(df_test.head(100).index)  # do not reuse the validation data from 5.1 for model selection
dict_latent_traits = pickle.load(open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb"))

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):
        file = open("outputs/5_1_model_selection_LR_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # pipeline for difficulty
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(LinearRegression(), latent_trait_range=B_RANGE)
            ])
        )

        # pipeline for discrimination
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True),
vectorizer = CountVectorizer(max_features=2000, min_df=3, max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

# BoW to TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

# TF-IDF Vectorizer
# so we don't need a separate CountVectorizer and TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2000, min_df=3, max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

# creating training and test sets
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(X, y, test_size=0.2, random_state=0)

# train data with logistic regression
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()
# classifier.fit(text_train, sent_train)
seed = 1024
np.random.seed(seed)
from config import path

ft = ['question1_distinct_unigram_question2_distinct_unigram']
train = pd.read_csv(path + "train_cooccurrence_distinct.csv")[ft]
test = pd.read_csv(path + "test_cooccurrence_distinct.csv")[ft]
len_train = train.shape[0]

max_features = None
ngram_range = (1, 1)
min_df = 3

print('Generate tfidf')
feats = ['question1_distinct_unigram_question2_distinct_unigram']
vect_orig = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, min_df=min_df)

# fit the vocabulary on the training text of each feature column
corpus = []
for f in feats:
    train[f] = train[f].astype(str)
    test[f] = test[f].astype(str)
    corpus += train[f].values.tolist()
vect_orig.fit(corpus)

for f in feats:
    train_tfidf = vect_orig.transform(train[f].values.tolist())
    test_tfidf = vect_orig.transform(test[f].values.tolist())
def tf_idf(message):
    """Build a TF-IDF feature DataFrame from a list of messages and return it with the fitted vectorizer."""
    tfidfVectorizer = TfidfVectorizer()
    term_matrix = tfidfVectorizer.fit_transform(message)
    pd.set_option('display.max_columns', None)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    features = pd.DataFrame(term_matrix.toarray(), columns=tfidfVectorizer.get_feature_names_out())
    return features, tfidfVectorizer
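# A minimal usage sketch with a hypothetical list of messages.
messages = ['free entry win a prize', 'are we meeting today', 'win cash now']
features, fitted_vectorizer = tf_idf(messages)
print(features.shape)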
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Fit TF-IDF on the training text and apply it to both splits, returning dense arrays."""
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with", str(np.array(X_train).shape[1]), "features")
    return (X_train, X_test)
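# A minimal usage sketch with hypothetical train/test lists.
X_train_tfidf, X_test_tfidf = TFIDF(['some training text', 'another training document'],
                                    ['unseen test text'])
print(X_train_tfidf.shape, X_test_tfidf.shape)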
train_words_list = train_words_list1 + train_words_list2 + train_words_list3 + train_words_list4
train_labels = train_labels1 + train_labels2 + train_labels3 + train_labels4

# test data
test_words_list1, test_labels1 = loadfile('data/test/女性', '女性')
test_words_list2, test_labels2 = loadfile('data/test/体育', '体育')
test_words_list3, test_labels3 = loadfile('data/test/文学', '文学')
test_words_list4, test_labels4 = loadfile('data/test/校园', '校园')
test_words_list = test_words_list1 + test_words_list2 + test_words_list3 + test_words_list4
test_labels = test_labels1 + test_labels2 + test_labels3 + test_labels4

stop_words = open('stopword.txt', 'r', encoding='utf-8').read()
# strip the \ufeff BOM at the head of the file
stop_words = stop_words.encode('utf-8').decode('utf-8-sig')
# split on newlines
stop_words = stop_words.split('\n')

# compute word weights with TF-IDF
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
train_features = tf.fit_transform(train_words_list)
test_features = tf.transform(test_words_list)

# multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)

# compute accuracy
print('Accuracy:', metrics.accuracy_score(test_labels, predicted_labels))
def main(): print("Reading Training Data") training = read_training_data("../data/train_with_test.csv") nb = MultinomialNB() tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', stop_words=None, min_df=3) tfidf_features = tfidf_vectorizer.fit_transform(training["data"]) lr = LogisticRegression(C=.1, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', tol=0.0001) bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\b\w+\b', stop_words=None, min_df=3, binary=True) text_features = bigram_vectorizer.fit_transform(training["data"]) feature_functions = [ get_words_upper, get_personal_refs, get_word_count, get_common_insults, get_common_swear_words, get_letters_upper, get_exaggeration ] features = extract_features(training["data"], feature_functions) lr.fit(text_features, training["labels"]) lr_preds = lr.predict_proba(text_features) nb.fit(tfidf_features, training["labels"]) nb_preds = nb.predict_proba(tfidf_features) rf = RandomForestClassifier(n_estimators=500) rf.fit(features, training["labels"]) rf_preds = rf.predict_proba(features) gb_features = numpy.empty((len(lr_preds), 3)) for i in range(len(lr_preds)): gb_features[i][0] = (lr_preds[i][1]) gb_features[i][1] = (rf_preds[i][1]) gb_features[i][2] = (nb_preds[i][1]) gb = GradientBoostingClassifier(n_estimators=200) gb.fit(gb_features, training["labels"]) print("Reading Test Data") test = read_final_test_data("../data/impermium_verification_set.csv") text_features_test = bigram_vectorizer.transform(test["data"]) tfidf_features_test = tfidf_vectorizer.transform(test["data"]) features = extract_features(test["data"], feature_functions) lr_preds = lr.predict_proba(text_features_test) rf_preds = rf.predict_proba(features) nb_preds = nb.predict_proba(tfidf_features_test) gb_features = numpy.empty((len(lr_preds), 3)) lr_pred = [] rf_pred = [] gb_pred = [] nb_pred = [] for i in range(len(lr_preds)): gb_features[i][0] = (lr_preds[i][1]) gb_features[i][1] = (rf_preds[i][1]) gb_features[i][2] = (nb_preds[i][1]) lr_pred.append(lr_preds[i][1]) rf_pred.append(rf_preds[i][1]) nb_pred.append(nb_preds[i][1]) predictions = gb.predict_proba(gb_features) output_file = "submission.csv" print("Writing submission to %s" % output_file) f = open(output_file, "w") f.write("id,insult\n") for i in range(len(test["data"])): f.write("%d,%f\n" % (test["ids"][i], predictions[i][1])) gb_pred.append(predictions[i][1]) f.close()
print('\n-------------Naive Bayes Classification with BOW Vectorization-------------')
accuracyDict["BOW-NB"] = NaiveBayesClassification(trainX, trainY, testX, testY, le)

print('\n-------------K Nearest Neighbor Classification with BOW Vectorization-------------')
accuracyDict["BOW-knn"] = KnnClassification(trainX, trainY, testX, testY, le)
# endregion

# - #### Tf-idf vectorization
# region
tfIdfVectorizer = TfidfVectorizer(max_features=1000)
trainX = tfIdfVectorizer.fit_transform(trainDataSet['CONTENT'])
testX = tfIdfVectorizer.transform(testDataSet['CONTENT'])

print('-------------SVM Classification with TfIdf Vectorization-------------')
accuracyDict["TfIdf-SVM"] = SvmClassification(trainX, trainY, testX, testY, le)

print('\n-------------Random Forests Classification with TfIdf Vectorization-------------')
accuracyDict["TfIdf-RandomForests"] = RandomForestClassification(trainX, trainY, testX, testY, le)

print('\n-------------Naive Bayes Classification with TfIdf Vectorization-------------')
    return text

filt_X = raw_X.apply(filtering)
X = filt_X.apply(wordToken)

# Plotting a histogram of comment lengths
Len_X = X.apply(len)
plt.hist(Len_X, bins=50)
plt.show()
# print(Len_X)

# Splitting data into two sets: train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# Transforming text to feature vectors using TfidfVectorizer
TV = TfidfVectorizer(max_features=5000)
X_train = TV.fit_transform(X_train)
X_test = TV.transform(X_test)

column_names = Y_train.columns

# Training one model per label on the training data and computing its accuracy
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C=12.0)
for x in column_names:
    target = Y_train[x]
    LR.fit(X_train, target)
    Y_pred = LR.predict(X_test)
    Accuracy = accuracy_score(Y_test[x], Y_pred)
    print("Accuracy for ", x, ":", Accuracy)
test_data = pd.DataFrame()
test_data['Mail'] = X_test
test_data['Class'] = y_test
# test_data = test_data.sample(frac=1)
test_data.reset_index(inplace=True, drop=True)

# Encoding y_train and y_test
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
# use the mapping learned on the training labels for the test labels as well
y_test = encoder.transform(y_test)

# creating the tf-idf vectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(data['Mail'])
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# creating a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(data['Mail'])

# transform the training and validation data using the count vectorizer object
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)
def train(self):
    vectorDimension = 200

    scriptDir = os.path.dirname(__file__)
    corpusPath = os.path.join(scriptDir, 'data', 'domain', '*.txt')
    listOfCorpusFiles = sorted(glob.glob(corpusPath))
    print("\tProcessing following Corpus files:", *listOfCorpusFiles, sep='\n\t')

    corpus = []
    faq = []
    for fileName in listOfCorpusFiles:
        corpusFile = codecs.open(fileName, 'r', encoding='utf-8')
        corpus.append(corpusFile.read())

    faqPath = os.path.join(scriptDir, 'data', 'faq', '*.txt')
    listOfFaqFiles = sorted(glob.glob(faqPath))
    print("\n\tProcessing following FAQ files:", *listOfFaqFiles, sep='\n\t')
    for fileName in listOfFaqFiles:
        faqFile = codecs.open(fileName, 'r', encoding='utf-8').read()
        i = 1
        for line in faqFile.split('\n'):
            if line.count('?') > 1:
                print("\tSEVERE:Found multiple questions in %s at line %d."
                      % (os.path.basename(fileName), i))
                print("\tSEVERE:Aborting the process..!!!")
                sys.exit("\tAborting...")
            line = line.replace('$', 'USD')
            line = line.replace('"', '\'')
            que, ans = line.split('?')
            corpus.append(que + ' ?')
            faq.append(line)
            i += 1
    print('\n\tTotal no of questions for training: %s' % len(corpus))

    # build the stop-word list from the dictionary file plus NLTK's English list
    stopListFile = os.path.join(scriptDir, 'data', 'dictionary', 'stopwords_en.txt')
    arrayWords = []
    stopWords = []
    f = codecs.open(stopListFile, 'r', 'utf-8')
    lines = f.read().split("\n")
    for line in lines:
        if line != "":
            arrayWords.append(line.split(','))
    for a_word in arrayWords:
        for s_word in a_word:
            if re.sub(' ', '', s_word) != "":
                stopWords.append(s_word)
    extraStopWords = set(stopWords)
    stops = set(stopwords.words('english')) | extraStopWords

    # note: the first positional argument of TfidfVectorizer is `input`, not the
    # corpus, so the corpus is not passed to the constructor
    tfidfVec = TfidfVectorizer(decode_error='ignore', stop_words=stops,
                               ngram_range=(1, 5), tokenizer=m.stemTokenize_2)
    trainsetIdfVectorizer = tfidfVec.fit_transform(corpus).toarray()
    vLength = len(trainsetIdfVectorizer[1])
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        nDimension = vLength - 1

    svd = TruncatedSVD(n_components=nDimension, algorithm='randomized', n_iter=15, random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    # persist the corpus, FAQ, vectorizer, SVD model, and LSA matrix
    picklePath = os.path.join(scriptDir, 'model')
    fileName = os.path.join(picklePath, 'corpus.m')
    fileObject = open(fileName, 'wb')
    pickle.dump(corpus, fileObject)
    fileObject.close()

    fileName = os.path.join(picklePath, 'faq.m')
    fileObject = open(fileName, 'wb')
    pickle.dump(faq, fileObject)
    fileObject.close()

    fileName = os.path.join(picklePath, 'tfidfVec.m')
    fileObject = open(fileName, 'wb')
    pickle.dump(tfidfVec, fileObject)
    fileObject.close()

    fileName = os.path.join(picklePath, 'svd.m')
    fileObject = open(fileName, 'wb')
    pickle.dump(svd, fileObject)
    fileObject.close()

    fileName = os.path.join(picklePath, 'trainLSA.m')
    fileObject = open(fileName, 'wb')
    pickle.dump(trainLSA, fileObject)
    fileObject.close()
    msg_with_removed_num = ''.join([char for char in msg_with_removed_punc if char not in '1234567890'])

    # convert from uppercase to lowercase
    msg_aftr_converted_to_Lowercase = ''.join([char.lower() for char in msg_with_removed_num])

    # lemmatization (join word tokens back with spaces, not with '')
    lem_word_tokens = nltk.word_tokenize(msg_aftr_converted_to_Lowercase)
    lemmatized_message = ' '.join([wordnet_lemmatizer.lemmatize(word) for word in lem_word_tokens])

    # stemming
    stemming_word_tokens = nltk.word_tokenize(lemmatized_message)
    stemmed_message = ' '.join([snowball_stemmer.stem(word) for word in stemming_word_tokens])

    # stop words; return the remaining tokens as a list, since the function is
    # used below as a TfidfVectorizer analyzer and an analyzer should yield tokens
    stopwords_tokens = nltk.word_tokenize(stemmed_message)
    msg_with_removed_stopwords = [word for word in stopwords_tokens if word not in stopwords.words('english')]
    return msg_with_removed_stopwords

from sklearn.feature_extraction.text import TfidfVectorizer  # for feature extraction
vectorizer_new = TfidfVectorizer(analyzer=cleaning_data)
spam_tfidfvectorizer_new = vectorizer_new.fit_transform(spam_data["v2"])

X = spam_tfidfvectorizer_new
y = spam_data['v1']

# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.naive_bayes import GaussianNB
NB_classifier = GaussianNB()
NB_classifier_model = NB_classifier.fit(X_train.toarray(), y_train)
    print(np.array(labelList))

def main():
    process_resume_list()

save_model = 'finalized_model.sav'
save_vector = 'finalized_vectorizer.sav'

if __name__ == '__main__':
    main()

label = np.array(labelList)
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', max_features=250)
resumes_train, resumes_test, y_train, y_test = train_test_split(resume_list, label,
                                                                test_size=0.33, random_state=1)
X_train = vectorizer.fit_transform(resumes_train)
# transform (not fit_transform) the test resumes so they share the training vocabulary
X_test = vectorizer.transform(resumes_test)
X_train_array = X_train.toarray()
X_test_array = X_test.toarray()
y_test1 = y_test.reshape(-1, 1)
print(vectorizer.get_feature_names_out())
pickle.dump(vectorizer, open(save_vector, 'wb'))

# Implementing Bernoulli Naive Bayes
naive_bayes = BernoulliNB(alpha=1.0)
naive_bayes.fit(X_train_array, y_train)
test_labels = []
for curr_class in classes:
    dirname = os.path.join(data_dir, curr_class)
    for fname in os.listdir(dirname):
        with open(os.path.join(dirname, fname), 'r') as f:
            content = f.read()
            # files whose names start with 'cv9' go to the test split
            if fname.startswith('cv9'):
                test_data.append(content)
                test_labels.append(curr_class)
            else:
                train_data.append(content)
                train_labels.append(curr_class)

# Create feature vectors
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

clf = MultinomialNB()
t0 = time.time()
clf.fit(train_vectors, train_labels)
t1 = time.time()
prediction = clf.predict(test_vectors)
t2 = time.time()
time_train = t1 - t0
time_predict = t2 - t1
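# The snippet measures timing but never reports quality; a minimal evaluation
# sketch using scikit-learn's classification_report (an addition, not part of the original).
from sklearn.metrics import classification_report
print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
print(classification_report(test_labels, prediction))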
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from joblib import dump

data = fetch_20newsgroups()
categories = ['comp.windows.x', 'misc.forsale', 'rec.motorcycles']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

# TF-IDF features feeding a multinomial naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)
test.target[0:10]

n = len(test.data)
acc = [1 for i in range(n) if test.target[i] == labels[i]]
print(f'Acc : {sum(acc)*100/n} %')

dump(model, 'chatgroup.model')
Acc = sum(acc)*100/n
dump(Acc, 'acc.model')
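# A minimal sketch of reloading the dumped pipeline and classifying new text
# (the example sentence is made up).
from joblib import load
reloaded = load('chatgroup.model')
pred = reloaded.predict(['selling my old motorcycle helmet, barely used'])
print(train.target_names[pred[0]])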
# Features are the passwords themselves
features = data.values[:, 1].astype('str')
# Labels are the password strength
labels = data.values[:, -1].astype('int')

# Splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=0)

# Sequentially apply a list of transforms and a final estimator:
# character-level TF-IDF feeding a multinomial logistic regression
classifier_model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('logisticRegression', LogisticRegression(multi_class='multinomial', solver='sag')),
])

# Fit the model
classifier_model.fit(X_train, y_train)
y_pred = classifier_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix: \n", cm)
# accuracy from the 3x3 confusion matrix: trace over the total count
accuracy = (cm[0][0] + cm[1][1] + cm[2][2]) / (cm[0][0] + cm[0][1] + cm[0][2] +
                                               cm[1][0] + cm[1][1] + cm[1][2] +
                                               cm[2][0] + cm[2][1] + cm[2][2])
from sklearn.naive_bayes import MultinomialNB

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
# the old `charset` argument is now called `encoding`
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))
from time import time
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from dabl.search import GridSuccessiveHalving

data_train = fetch_20newsgroups(subset="train")
data_test = fetch_20newsgroups(subset="test")

pipe = Pipeline([('vect', CountVectorizer()),
                 ('clf', LogisticRegression())])

param_grid = {
    'vect': [TfidfVectorizer(), CountVectorizer()],
    'clf__C': np.logspace(-3, 3, 7),
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]}

print("Parameter grid:")
print(param_grid)

sh = GridSuccessiveHalving(pipe, param_grid, cv=5)
print("Start successive halving")
tick = time()
sh.fit(data_train.data, data_train.target)
print("Training Time Successive Halving", time() - tick)
print("Test Score Successive Halving: ", sh.score(data_test.data, data_test.target))
print("Parameters Successive Halving: ", sh.best_params_)

gs = GridSearchCV(pipe, param_grid, cv=5)
                                                  random_state=2020,
                                                  test_size=0.1,
                                                  shuffle=True)

def multiclass_logloss(actual, predicted, eps=1e-15):
    # one-hot encode the labels if they are given as integer class ids
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2
    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=4000)
# fit the vocabulary on train + validation text, then transform each split
tfidf_vectorizer.fit(list(x_train) + list(x_val))
xtrain = tfidf_vectorizer.transform(x_train)
xval = tfidf_vectorizer.transform(x_val)
xtest = tfidf_vectorizer.transform(test.clean_script.values)
xtrain.shape, xval.shape

model = LogisticRegression()
model.fit(xtrain, ytrain)
pred_prob = model.predict_proba(xval)
print(multiclass_logloss(yval, pred_prob))
pred_test = model.predict_proba(xtest)
def main():
    # loading Data
    category = ['Accepted', 'Rejected']
    doc_to_data = skd.load_files('Dataset/', description=None, categories=category,
                                 load_content=True, encoding='ISO-8859-1', random_state=24)
    # print(doc_to_data.data)
    # print(doc_to_data.target)

    # Splitting Data
    X_train, X_test, y_train, y_test = train_test_split(doc_to_data.data, doc_to_data.target,
                                                        test_size=0.05, random_state=24)
    zippedList = list(zip(X_train, y_train))
    df = pd.DataFrame(zippedList, columns=['Isnad', 'Class'])

    custom_stop_words = [
        'Tell', 'Tell us', 'Narrated', 'Messenger', 'Prophet', 'Aisha', 'Division',
        'Allah', 'God', 'Lord', 'Allaah', 'He', 'She', 'A', 'They', '(h)', 'We', 'It'
    ]
    vector = TfidfVectorizer(encoding='ISO-8859-1', lowercase=False, preprocessor=preprocess,
                             tokenizer=tokenization, min_df=2, max_df=0.5,
                             stop_words=custom_stop_words, sublinear_tf=True,
                             use_idf=True, smooth_idf=True)
    # counts = vector.fit(X_train)
    # print(vector.get_feature_names())
    # transformer = TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)
    features = vector.fit_transform(X_train).toarray()
    labels = df.Class

    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        # SVC(C=1.0, kernel='linear', degree=3, gamma='auto'),
        naive_bayes.BernoulliNB(),
        SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None),
        LogisticRegression(penalty='l2', solver='liblinear', C=1, class_weight='balanced',
                           random_state=24, tol=0.000001),
        KNeighborsClassifier(n_neighbors=3)
    ]

    # 5-fold cross-validation of each model on the TF-IDF features
    CV = 5
    cv_df = pd.DataFrame(index=range(CV * len(models)))
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
    print(cv_df.groupby('model_name').accuracy.mean())

    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=8, jitter=True,
                  edgecolor="gray", linewidth=2)
    plot.show()