Example #1
def fit_tfidf(questionset, get_model=False):
    # Fit a TF-IDF vectorizer on the question set; return either the fitted
    # vectorizer itself or the transformed document-term matrix.
    tfidf = TV(min_df=2, analyzer='word', stop_words='english')
    tfidf.fit(questionset)
    vectors = tfidf.transform(questionset)

    if get_model:
        return tfidf
    else:
        return vectors
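# Minimal usage sketch (the question strings below are illustrative placeholders,
# not part of the original example; assumes TV is imported as TfidfVectorizer):
questions = ["how do i sort a list", "how to sort a dict by value", "sort items fast"]
vectors = fit_tfidf(questions)                 # sparse TF-IDF matrix
model = fit_tfidf(questions, get_model=True)   # fitted TfidfVectorizer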
Example #2
def make_array(num_of_pages=100,
               min_links=5,
               min_length=3000,
               max_word=1):  # build and fill the word-vector matrix
    all_pages, pages_titles = wp.give_articles(num_of_pages)
    vect = TV(tokenizer=LemmaTokenizer(), max_df=int(max_word), min_df=0)
    X_train = vect.fit_transform(all_pages)
    labels = numpy.array(vect.get_feature_names())
    Arr = X_train.A  # dense array view of the sparse TF-IDF matrix
    return pages_titles, labels, Arr
Example #3
def classify(training_set, testing_set):

    # GET THE RESPONSE VARIABLES
    original_set = training_set + testing_set
    question_text = []
    response_var = []
    for i in range(len(original_set)):
        question_text.append(original_set[i][0])
        response_var.append(original_set[i][1])

    # TF-IDF REPRESENTATION
    tfidf = TV(min_df=2, analyzer='word', stop_words='english')
    tfidf.fit(question_text)
    vectors = tfidf.transform(question_text)

    model = MultinomialNB(alpha=0.5, fit_prior=True, class_prior=[0.3, 0.5, 0.2])
    model.fit(vectors[:len(training_set)], response_var[:len(training_set)])

    # prob = model.predict(vectors[len(training_set):])
    score = model.score(vectors[len(training_set):], response_var[len(training_set):])
    print("Performance of the Naïve Bayes Classifier", score)

    return model, tfidf
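# Toy usage sketch (illustrative data, not from the original example; the
# class_prior above implies exactly three label classes, and TV / MultinomialNB
# are assumed to be imported as in the snippet):
train = [("how do i sort a list in python", "python"),
         ("sort a python dict by value", "python"),
         ("join two tables in sql", "sql"),
         ("sql query to join tables", "sql"),
         ("center a div with css", "css"),
         ("css rule to center a div", "css")]
test = [("sorting lists in python", "python"),
        ("css center div", "css")]
model, tfidf = classify(train, test)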
Example #4
            ### use parseOutText to extract the text from the opened email
            email_text = parseOutText(email)
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]
            for w in [
                    "sara", "shackleton", "chris", "germani", "sshacklensf",
                    "cgermannsf"
            ]:
                if w in email_text:
                    email_text = email_text.replace(w, '')
            ### append the text to word_data
            word_data.append(email_text)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            from_data.append(0 if name == 'sara' else 1)

            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer as TV

vec = TV(stop_words='english')
word_data = vec.fit_transform(word_data)
vocab_list = vec.get_feature_names()
print('Number of words in the vocabulary is', len(vocab_list))
Example #5
def vectorize_text(x_train, x_test):
    vect = TV(strip_accents='unicode', analyzer='word')
    text_train = vect.fit_transform(x_train)
    text_test = vect.transform(x_test)
    # text_train, text_test = pd.DataFrame(text_train), pd.DataFrame(text_test)
    return text_train, text_test
Example #6
    xTest = []
    yTest = []

    for J in range(5):
        if J == index:
            for L in XkFold[J]:
                xTest.append(L)
            yTest.extend(YkFold[J])
        else:
            for L in XkFold[J]:
                xTrain.append(L)
            yTrain.extend(YkFold[J])

    assert len(xTrain) == len(yTrain)
    # rebalance the training fold, then build a bigram TF-IDF representation
    xTrainNew, yTrainNew = balancedTrain(xTrain, yTrain, 'CV')
    counterList.append(TV(ngram_range=(2, 2), min_df=5))
    trainVector = counterList[-1].fit_transform(xTrainNew)
    testVector = counterList[-1].transform(xTest)

    # keep the 10,000 features with the highest chi-squared scores
    selectList.append(SelectKBest(chi2, k=10000))

    trainVector = selectList[-1].fit_transform(trainVector, yTrainNew)
    testVector = selectList[-1].transform(testVector)

    mreTotal.append(0)
    for J in clfOption:
        J.fit(trainVector, yTrainNew)
        prediction = J.predict(testVector)
        mreTotal[-1] += mrc(prediction, yTest)

    index = mreTotal.index(max(mreTotal))
Example #7
            for keyword in stopwords:
                text = text.replace(keyword, "")
            ### append the text to word_data
            word_data.append(text)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name=="sara"":
                from_data.append(0)
            else:
                from_data.append(1)
            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )

### in Part 4, do TfIdf vectorization here
# the stop-word list used is the built-in "english" one
vectorizer = TV(stop_words="english")
vectorizer.fit(word_data)
vectorizer.transform(word_data)
feature_words = vectorizer.get_feature_names()
# print out info
print("total number of words: ", len(feature_words))
Example #8
def __init__(self, nc=20, max_df=0.3, min_df=0.035):
    self.docu = Docu().load()
    vectorizer = TV(max_df=max_df, min_df=min_df)
    self.vectors = vectorizer.fit_transform(self.docu.text).todense()
    self.nc = nc
    self.model = None
"""

stemmer = EnglishStemmer()
decoder = lambda x: x.decode('cp850')
train['comment_text'] = train['comment_text'].apply(
    str.lower).apply(wordpunct_tokenize).apply(' '.join).apply(decoder).apply(
        stemmer.stem)
test['comment_text'] = test['comment_text'].apply(
    str.lower).apply(wordpunct_tokenize).apply(' '.join).apply(decoder).apply(
        stemmer.stem)
"""
Part 3. model
"""

# vec = TV(ngram_range=(1, 2))
vec = TV()
model = SVC(kernel='linear',
            probability=True)  # so model will provide predict_proba method

# each document is represented by the weighted frequencies of the words in the vocabulary
train_doc = vec.fit_transform(train['comment_text'])
test_doc = vec.transform(test['comment_text'])
"""
Part 4. Validation
"""

# run model
preds = np.zeros((len(test), len(label_cols)))

n_splits = 5
total_losses = []
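# Sketch of the validation loop this part sets up (not the original code; assumes
# each column named in `label_cols` is a binary 0/1 target column of `train`):
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for i, label in enumerate(label_cols):
    y = train[label].values
    fold_losses = []
    for train_idx, valid_idx in kf.split(train_doc):
        model.fit(train_doc[train_idx], y[train_idx])
        valid_proba = model.predict_proba(train_doc[valid_idx])[:, 1]
        fold_losses.append(log_loss(y[valid_idx], valid_proba))
    total_losses.append(np.mean(fold_losses))
    # refit on the full training set and fill in test-set probabilities
    model.fit(train_doc, y)
    preds[:, i] = model.predict_proba(test_doc)[:, 1]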
Example #10
        words.extend([x[:2], x[2:4], x[4:6], x[5:]])
        sentence = ' '.join(words)  # join the feature words with spaces
    tokens.append(words)
    corpus.append(sentence[:])

#print(corpus)

#3. Compute the TF-IDF weight matrix for the documents and build the vocabulary
#Important attributes and functions:
#tv_model.get_feature_names()  # get the feature words
#tv_model.vocabulary_  # the vocabulary
#vecs.size   # total length of the feature vectors
#vecs.shape  # dimensions of the feature-vector matrix
from sklearn.feature_extraction.text import TfidfVectorizer as TV  # TF-IDF vectorizer

model = TV()  # TF-IDF vectorizer
vecs_train = model.fit_transform(corpus)  # sparse matrix (avoids memory blow-up); each row is one poem, each column one feature word
vocab_train = model.get_feature_names()
vocab_size = len(vocab_train)  # vocabulary size
#print(f'  vocabulary size: {vocab_size}')
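# Illustrative peek at the attributes listed in the comments above:
#print(vecs_train.shape)                      # (number of poems, vocabulary size)
#print(list(model.vocabulary_.items())[:5])   # a few (word, column index) pairs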

import numpy as np
# the dataset is large and cosine similarity is slow, so sample one random batch
train_size = len(s_train)  # training-set size: 32,196 poems in total
batch_size = 200  # number of documents per batch

batch_mask = np.random.choice(train_size, batch_size, replace=False)
vecs = vecs_train[batch_mask]  # randomly draw batch_size poems from the feature matrix

#4. Compute cosine similarity and build a similarity dictionary; the dataset is large and this step is slow, so test on a subset of the corpus
from scipy.spatial.distance import cosine
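# Sketch of the similarity step described above (assumed continuation, not the
# original code): pairwise cosine similarity over the sampled batch.
dense = vecs.toarray()  # the batch is small, so a dense copy is affordable
sim_dict = {}
for i in range(batch_size):
    for j in range(i + 1, batch_size):
        # scipy's cosine() returns a distance; similarity = 1 - distance
        sim_dict[(i, j)] = 1.0 - cosine(dense[i], dense[j])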
Example #11
    #set1 = re.sub('[\'!@#$.,?"&$%*:/<>;()=+~]', ' ', set1)  # Deletes all punctuations
    #set1 = set1.lower().split()
    set2 = input[2]
    #set2 = re.sub('[\'!@#$.,?"&$%*:/<>;()=+~]', ' ', set2)
    #set2 = set2.lower().split()
    test_originals.append(set1 + set2)
    #test_originals.append(' '.join(set2))
    #test_corpus_set_1.append(set1)
    #test_corpus_set_2.append(set2)
    #total_phrases.append(set1+set2)

# TF-IDF REPRESENTATION
print('Generating TF-IDF Representation...')
tfidf = TV(min_df=2,
           analyzer='word',
           strip_accents='unicode',
           tokenizer=tokenize,
           use_idf=False)
originals = train_originals + test_originals
tfidf.fit(originals)  # This is the slow part!
originals = tfidf.transform(originals)

# DIMENSIONALITY REDUCTION - SVD
print('Reducing dimensions...')
svd = TruncatedSVD(n_components=500, n_iter=8)
svd.fit(originals)
originals = svd.transform(originals)
explained_variance = svd.explained_variance_ratio_.sum()
print('Explained Variance: ' + str(explained_variance))

# LOGISTIC REGRESSION
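# Sketch of the step the header announces (the labels `y_train` are assumed to be
# defined elsewhere in the script; they are not part of this snippet):
from sklearn.linear_model import LogisticRegression

n_train = len(train_originals)
clf = LogisticRegression(max_iter=1000)
clf.fit(originals[:n_train], y_train)
test_preds = clf.predict(originals[n_train:])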
Example #12
Kalam: The main lesson I learned was how multiple technical teams and departments of the government of India could work together for a great mission as an industrial partnership. It was a great experience.

India Knowledge@Wharton: You are known to be deeply spiritual. Did you ever feel conflicted, or guilty, about developing missiles and nuclear weapons? Why, or why not?

Kalam: I realize that for my country’s development, peace is essential. Peace comes from strength — because strength respects strength. That is how our weaponized missiles were born. You need strength to keep the nation peaceful, so that you can focus on the necessary developmental missions. That is how I see it."""

# cleaning the text: lemmatizing, removing stopwords, and lowercasing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
sentences = nltk.sent_tokenize(passage)
clean_sents = []

for i in range(len(sentences)):
    temp_sent = re.sub('[^a-zA-Z]', ' ', sentences[i])
    temp_sent = temp_sent.lower()
    temp_sent = temp_sent.split()
    temp_sent = [
        wnl.lemmatize(word) for word in temp_sent
        if word not in set(stopwords.words('english'))
    ]
    temp_sent = ' '.join(temp_sent)
    clean_sents.append(temp_sent)

#creating TF-IDF model

from sklearn.feature_extraction.text import TfidfVectorizer as TV
tv = TV()
X = tv.fit_transform(clean_sents).toarray()