def fit_tfidf(questionset, get_model=False):
    """Fit a TF-IDF vectorizer on a collection of question texts.

    Parameters
    ----------
    questionset : iterable of str
        Raw text documents to learn the vocabulary from.
    get_model : bool, optional
        When True, return the fitted vectorizer itself instead of the
        transformed document-term matrix.

    Returns
    -------
    The fitted ``TV`` (TfidfVectorizer) instance if *get_model* is True,
    otherwise the sparse TF-IDF matrix of *questionset*.
    """
    tfidf = TV(min_df=2, analyzer='word', stop_words='english')
    if get_model:
        # Caller only wants the fitted model -- the original code also ran
        # transform() here and threw the result away; skip that wasted pass.
        tfidf.fit(questionset)
        return tfidf
    # fit_transform() learns the vocabulary and projects in a single pass,
    # equivalent to the original fit() followed by transform().
    return tfidf.fit_transform(questionset)
def make_array(num_of_pages=100, min_links=5, min_lenth=3000, max_word=1):
    """Build a dense TF-IDF document-term matrix from Wikipedia articles.

    Returns a tuple ``(pages_titles, labels, Arr)``: the article titles,
    the learned vocabulary as a numpy array, and the dense feature matrix
    (one row per article, one column per feature word).
    """
    # Create and populate the word vectors.  (original comment was Russian)
    texts, titles = wp.give_articles(num_of_pages)
    vectorizer = TV(tokenizer=LemmaTokenizer(), max_df=int(max_word), min_df=0)
    doc_term = vectorizer.fit_transform(texts)
    vocabulary = numpy.array(vectorizer.get_feature_names())
    # ``.A`` converts the sparse result to a dense ndarray.
    return titles, vocabulary, doc_term.A
def classify(training_set, testing_set):
    """Train and evaluate a multinomial Naive Bayes text classifier.

    Parameters
    ----------
    training_set, testing_set : list of sequences
        Each element is ``(question_text, response_label, ...)``; only the
        first two positions are used.

    Returns
    -------
    (model, tfidf) : the fitted ``MultinomialNB`` classifier and the fitted
        ``TV`` (TfidfVectorizer), so callers can transform new questions.
    """
    # GET THE RESPONSE VARIABLES
    # Concatenate so the vectorizer's vocabulary covers both splits.
    original_set = training_set + testing_set
    # Idiomatic item access instead of `for i in range(0, len(...))`.
    question_text = [item[0] for item in original_set]
    response_var = [item[1] for item in original_set]

    # WORD2VEC / TF-IDF REPRESENTATION
    tfidf = TV(min_df=2, analyzer='word', stop_words='english')
    # fit_transform == fit() followed by transform() in one pass.
    vectors = tfidf.transform(question_text) if False else tfidf.fit_transform(question_text)

    # NOTE: class_prior assumes exactly three classes -- unchanged from the
    # original to preserve behavior for existing callers.
    model = MultinomialNB(alpha=0.5, fit_prior=True, class_prior=[0.3, 0.5, 0.2])
    n_train = len(training_set)  # hoisted: used three times below
    model.fit(vectors[:n_train], response_var[:n_train])
    score = model.score(vectors[n_train:], response_var[n_train:])
    print("Performance of the Naïve Bayes Classifier", score)
    return model, tfidf
### use parseOutText to extract the text from the opened email email_text = parseOutText(email) ### use str.replace() to remove any instances of the words ### ["sara", "shackleton", "chris", "germani"] for w in [ "sara", "shackleton", "chris", "germani", "sshacklensf", "cgermannsf" ]: if w in email_text: email_text = email_text.replace(w, '') ### append the text to word_data word_data.append(email_text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris from_data.append(0 if name == 'sara' else 1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "w")) pickle.dump(from_data, open("your_email_authors.pkl", "w")) ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer as TV vec = TV(stop_words='english') word_data = vec.fit_transform(word_data) vocab_list = vec.get_feature_names() print 'Number of words in the vocabulary is', len(vocab_list)
def vectorize_text(x_train, x_test):
    """TF-IDF encode two text collections with a shared vocabulary.

    The vocabulary and IDF weights are learned from *x_train* only; *x_test*
    is projected onto that same feature space.  Returns the two sparse
    matrices ``(text_train, text_test)``.
    """
    vectorizer = TV(strip_accents='unicode', analyzer='word')
    encoded_train = vectorizer.fit_transform(x_train)
    encoded_test = vectorizer.transform(x_test)
    return encoded_train, encoded_test
# NOTE(review): fragment of a 5-fold cross-validation loop.  xTrain, yTrain,
# XkFold, YkFold, counterList, selectList, mreTotal, clfOption, mrc and the
# initial value of `index` are all defined outside this view -- presumably
# this whole span runs once per fold; confirm against the enclosing code.
xTest = []
yTest = []
for J in range(5):
    if J == index:
        # Fold `index` becomes the held-out test split.
        for L in XkFold[J]:
            xTest.append(L)
        yTest.extend(YkFold[J])
    else:
        # All remaining folds are accumulated into the training split.
        for L in XkFold[J]:
            xTrain.append(L)
        yTrain.extend(YkFold[J])
assert len(xTrain) == len(yTrain)
# Re-balance class distribution in the training split.
xTrainNew, yTrainNew = balancedTrain(xTrain, yTrain, 'CV')
# Bigram counts, ignoring terms seen in fewer than 5 documents.
counterList.append(TV(ngram_range=(2, 2), min_df=5))
trainVector = counterList[-1].fit_transform(xTrainNew)
testVector = counterList[-1].transform(xTest)
# Keep the 10k features with the highest chi-squared score vs. the labels.
selectList.append(SelectKBest(chi2, k=10000))
trainVector = selectList[-1].fit_transform(trainVector, yTrainNew)
testVector = selectList[-1].transform(testVector)
mreTotal.append(0)
for J in clfOption:
    # `J` is reused here as a classifier object (it was a fold index above).
    J.fit(trainVector, yTrainNew)
    prediction = J.predict(testVector)
    mreTotal[-1] += mrc(prediction, yTest)
# Pick the run with the best accumulated score for the next iteration.
index = mreTotal.index(max(mreTotal))
for keyword in stopwords: text = text.replace(keyword, "") ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name=="sara"": from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) ### in Part 4, do TfIdf vectorization here #the given stopword is "english" vectorizer=TV(stop_words="english") vectorizer.fir(word_data) vectorizer.transform(word_data) feature_words=vectorizer.get_feature_names() #print out info print "total number of words: ", len(feature_words)
def __init__(self, nc=20, max_df=0.3, min_df=0.035):
    """Load the document corpus and precompute its TF-IDF matrix.

    nc:      stored as-is; presumably the number of clusters/components
             used by later fitting code -- confirm against the class.
    max_df / min_df: document-frequency cutoffs forwarded to the vectorizer.
    """
    self.docu = Docu().load()
    tfidf = TV(max_df=max_df, min_df=min_df)
    # todense() converts the sparse TF-IDF result into a dense numpy matrix.
    self.vectors = tfidf.fit_transform(self.docu.text).todense()
    self.nc = nc
    self.model = None  # populated later by whatever fits the model
""" stemmer = EnglishStemmer() decoder = lambda x: x.decode('cp850') train['comment_text'] = train['comment_text'].apply( str.lower).apply(wordpunct_tokenize).apply(' '.join).apply(decoder).apply( stemmer.stem) test['comment_text'] = test['comment_text'].apply( str.lower).apply(wordpunct_tokenize).apply(' '.join).apply(decoder).apply( stemmer.stem) """ Part 3. model """ # vec = TV(ngram_range=(1, 2)) vec = TV() model = SVC(kernel='linear', probability=True) # so model will provide predict_proba method # 每个文档使用 vocabulary 中的词的词频表示 train_doc = vec.fit_transform(train['comment_text']) test_doc = vec.transform(test['comment_text']) """ Part 4. Validation """ # run model preds = np.zeros((len(test), len(label_cols))) n_splits = 5 total_losses = []
# NOTE(review): fragment -- `words`, `tokens`, `x`, `s_train` are defined
# outside this view; the first lines presumably run inside a loop over
# documents.  Slices x[4:6] and x[5:] overlap at index 5 -- confirm intended.
words.extend([x[:2], x[2:4], x[4:6], x[5:]])
sentence = ' '.join(words)  # join feature words with spaces
tokens.append(words)
corpus.append(sentence[:])
#print(corpus)
# 3. Compute the TF-IDF weight matrix for the documents and build the
#    vocabulary.  Useful attributes/functions:
#      tv_model.get_feature_names()  -- feature words
#      tv_model.vocabulary_          -- the vocabulary
#      vecs.size                     -- total feature-vector length
#      vecs.shape                    -- feature-matrix dimensions
from sklearn.feature_extraction.text import TfidfVectorizer as TV  # TF-IDF vectorizer
model = TV()  # TF-IDF vectorizer
# Sparse matrix (avoids memory blow-up): one row per poem, one column per
# feature word.
vecs_train = model.fit_transform(corpus)
vocab_train = model.get_feature_names()
vocab_size = len(vocab_train)  # vocabulary size
#print(f' vocabulary size:{vocab_size}')
import numpy as np
# The dataset is large and cosine similarity is slow, so sample one random
# batch for the computation.
train_size = len(s_train)  # training-set size: 32196 poems in total
batch_size = 200  # documents per batch
batch_mask = np.random.choice(train_size, batch_size, replace=False)
vecs = vecs_train[batch_mask]  # randomly pick batch_size poems from the matrix
# 4. Compute cosine similarity and build the similarity dictionary; the data
#    set is big and cosine similarity is slow, so test on a corpus subset.
from scipy.spatial.distance import cosine
#set1 = re.sub('[\'!@#$.,?"&$%*:/<>;()=+~]', ' ', set1) # Deletes all punctuations #set1 = set1.lower().split() set2 = input[2] #set2 = re.sub('[\'!@#$.,?"&$%*:/<>;()=+~]', ' ', set2) #set2 = set2.lower().split() test_originals.append(set1 + set2) #test_originals.append(' '.join(set2)) #test_corpus_set_1.append(set1) #test_corpus_set_2.append(set2) #total_phrases.append(set1+set2) # TF-IDF REPRESENTATION print('Generating TF-IDF Representation...') tfidf = TV(min_df=2, analyzer='word', strip_accents='unicode', tokenizer=tokenize, use_idf=False) originals = train_originals + test_originals tfidf.fit(originals) # This is the slow part! originals = tfidf.transform(originals) # DIMENSIONALITY REDUCTION - SVD print('Reducing dimensions...') svd = TruncatedSVD(n_components=500, n_iter=8) svd.fit(originals) originals = svd.transform(originals) explained_variance = svd.explained_variance_ratio_.sum() print('Explained Variance: ' + str(explained_variance)) # LOGISTIC REGRESSION
Kalam: The main lesson I learned was how multiple technical teams and departments of the government of India could work together for a great mission as an industrial partnership. It was a great experience.
India Knowledge@Wharton: You are known to be deeply spiritual. Did you ever feel conflicted, or guilty, about developing missiles and nuclear weapons? Why, or why not?
Kalam: I realize that for my country’s development, peace is essential. Peace comes from strength — because strength respects strength. That is how our weaponized missiles were born. You need strength to keep the nation peaceful, so that you can focus on the necessary developmental missions. That is how I see it."""

#cleaning of text -- lemmatizing,stopwords and lowering the text
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# `passage` is the long interview string whose opening quote is above this
# view; `nltk` itself is imported outside this view.
sentences = nltk.sent_tokenize(passage)
clean_sents = []
for i in range(len(sentences)):
    # Keep letters only, lower-case, split into words.
    temp_sent = re.sub('[^a-zA-Z]', ' ', sentences[i])
    temp_sent = temp_sent.lower()
    temp_sent = temp_sent.split()
    # Lemmatize and drop English stopwords.
    # NOTE(review): set(stopwords.words('english')) is rebuilt for every
    # word of every sentence -- hoisting it above the loop would be much
    # faster with identical results.
    temp_sent = [
        wnl.lemmatize(word) for word in temp_sent
        if word not in set(stopwords.words('english'))
    ]
    temp_sent = ' '.join(temp_sent)
    clean_sents.append(temp_sent)

#creating TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer as TV
tv = TV()
# Dense document-term matrix: one row per cleaned sentence.
X = tv.fit_transform(clean_sents).toarray()