def build_guten():
    sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))
    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist()
    senna = numpy.array(senna)[gsubset].tolist()
    hashtab = dict(zip(senna, range(len(gsubset))))
    vectorizer = tfidf(vocabulary=hashtab, stop_words='english')
    wsentences = []
    avglen = 0
    for s in sentences:
        print '*',
        news = ''
        for w in s:
            print '.',
            news += ' ' + senna[w]
        avglen += len(s)
        wsentences.append(news)
    print ''
    print 'Average sentence size:', avglen / float(len(sentences))
    tfidfmat = vectorizer.fit_transform(wsentences)
    numpy.save('/scratch/rifaisal/data/guten/guten_tfidf.npy', tfidfmat)
    cPickle.dump(vectorizer, open('gutentokenizer.pkl', 'w'))
    print 'Done!'
def main():
    traindata = p.read_table('train.tsv')
    tr_title, tr_body, tr_url = convert_text(traindata)
    testdata = list(np.array(p.read_table('test.tsv'))[:, 2])
    y = np.array(p.read_table('train.tsv'))[:, -1]
    wordCount = cv(stop_words='english', encoding='latin-1')
    wordTFIDF = tfidf(stop_words='english', encoding='latin-1')
    corpus = tr_body
    bag = wordCount.fit_transform(corpus)
    tfdif = wordTFIDF.fit_transform(corpus)
    tfdif = tfdif.toarray()
    kmeans_soln.getDender(bag, tr_title)
    titles = np.array(tr_title)
    vocab = wordCount.get_feature_names()
    vocabTF = wordTFIDF.get_feature_names()
    # NOTE: `centers` is not defined in this snippet; it presumably comes from a
    # k-means fit (e.g. cluster_centers_) elsewhere in the original script.
    topWords(centers, vocab)
def tfidfVectorize(texts, genKey, vocabulary=None, stop_words=None, min_df=1,
                   max_df=1.0, ngram_range=(1, 1), max_features=None):
    '''This will likely require fixing so that I can pass some of the parameters
    into this function and keep the remaining parameters at their defaults - i.e. to have
    Vectorize(...vocabulary=someVocab, ngram_range=(1,3)) and
    Vectorize(...stop_words=someStops, max_features=1000).
    See options here:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    NOTE: Check by using an empty vocabulary first. This should break.
    Others are default values and should be okay.
    '''
    print 'vectorizing ', str(len(texts)), ' texts'
    # The documents are passed to fit_transform() below, not to the constructor
    # (the original passed texts.values() as the first positional argument, which
    # TfidfVectorizer interprets as its `input` parameter); max_df is now
    # forwarded as well, since it is accepted by the signature above.
    vectorizer = tfidf(stop_words=stop_words, vocabulary=vocabulary,
                       min_df=min_df, max_df=max_df, ngram_range=ngram_range,
                       max_features=max_features)
    vec = vectorizer.fit_transform(texts.values())
    labels = []
    for k in texts.keys():
        labels.append(genKey[k])
    labels = np.asarray(labels)
    return vec, labels, vectorizer
def search(term, desc):
    term_l = term.lower()
    desc_l = desc.lower()
    word_tok = tfidf().build_tokenizer()
    desc = word_tok(desc_l)
    terms = word_tok(term_l)
    return all([x in desc for x in terms])
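# Hypothetical usage sketch (not part of the original snippet): `search` returns
# True only when every token of `term` occurs among the tokens of `desc`; both
# strings are lowercased and split with TfidfVectorizer's default tokenizer,
# which drops single-character tokens. The import mirrors the alias the
# function above relies on.
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

print(search("heart rate", "Resting heart rate of the patient"))    # True
print(search("heart murmur", "Resting heart rate of the patient"))  # False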
def main(questionsFile, articleFile):
    questions, oQuestions = read_file(questionsFile, True)
    if not questions:
        return
    sentences, oSentences = read_file(articleFile, False)
    transformation = tfidf(analyzer='word', ngram_range=(1, 1),
                           token_pattern=r'[^ ]+', min_df=1, norm="l1",
                           use_idf=True, smooth_idf=True, sublinear_tf=False,
                           stop_words="english")
    XSentences = transformation.fit_transform(sentences)
    XQuestions = transformation.transform(questions)
    y = np.arange(XSentences.shape[0])
    model = KNN(n_neighbors=1, metric="euclidean")
    model.fit(XSentences, y)
    yQuestions = model.predict(XQuestions)
    # print yQuestions
    for iQuestion, iAnswer in enumerate(yQuestions):
        # print iQuestion, iAnswer
        # print oQuestions[iQuestion]
        print oSentences[iAnswer]
        # print
        # print sentences[ i ]
        pass
    pass
def get_XTrain_XTest(text, y, cv=5):
    # This function returns a matrix split into Train and Test data.
    # The line below initializes the class that will turn the list of texts into
    # matrices. It applies a logarithm to the counts (sublinear_tf) and considers
    # only the features that have a minimum count of 2.
    modelT = tfidf(analyzer='word', ngram_range=(1, 1), token_pattern=r'[^ ]+',
                   min_df=2, norm=None, use_idf=False, smooth_idf=False,
                   sublinear_tf=True)
    indices_Train, indices_Test = list(), list()
    XTrain, XTest = list(), list()
    # NOTE: StratifiedKFold(y, cv, indices=True) is the old sklearn.cross_validation
    # API; recent scikit-learn versions use model_selection.StratifiedKFold instead.
    cvI = StratifiedKFold(y, cv, indices=True)
    for train, test in cvI:
        indices_Train.append(train)
        indices_Test.append(test)
        textTrain = [text[i] for i in train]
        textTest = [text[i] for i in test]
        modelC = clone(modelT)
        modelC.fit(textTrain)
        XTrain.append(modelC.transform(textTrain))
        XTest.append(modelC.transform(textTest))
    return XTrain, XTest, indices_Train, indices_Test
def bag_of_words(clean_train_reviews):
    # Initialize the vectorizer object, scikit-learn's bag of words tool
    # (note: the `tfidf` alias here points at TfidfVectorizer, not CountVectorizer).
    vectorizer = tfidf(analyzer="word",
                       tokenizer=None,
                       preprocessor=None,
                       stop_words=None,
                       max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns the
    # vocabulary; second, it transforms our training data into feature vectors.
    # The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an array
    train_data_features = train_data_features.toarray()

    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)

    # For each, print the vocabulary word and the number of times it
    # appears in the training set
    # for tag, count in zip(vocab, dist):
    #     print count, tag

    return train_data_features
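# Hypothetical usage sketch (illustrative only, not from the original source).
# The two imports mirror what the function above relies on; it expects a list of
# already-cleaned review strings and returns a dense (n_reviews x n_features)
# array of tf-idf weights.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

example_reviews = ["great movie with a clever plot",
                   "terrible movie and very poor acting"]
print(bag_of_words(example_reviews).shape)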
def search(query):
    key_terms = searcher(query)
    termlist = []
    # print(key_terms)
    if not key_terms:
        # print("here")
        key_terms = tfidf().build_tokenizer()(query)
        # print(key_terms)
    ranked = {}
    for term in key_terms:
        if term not in termlist:
            ranks, authrs = topic_z_scorer(term)
            if not ranked:
                ranked = authrs
            else:
                ranked = combine_z(ranked, authrs)
            termlist.append(term)
            # print(ranked,authrs)
    for term in key_terms:
        # (note: the original tokenizes the full query here, not the individual term)
        sub_terms = tfidf().build_tokenizer()(query)
        for sub_term in sub_terms:
            if sub_term not in termlist:
                ranks, authrs = topic_z_scorer(sub_term)
                authrs = {auth: authrs[auth] * 0.75 for auth in authrs}
                if not ranked:
                    ranked = authrs
                else:
                    ranked = combine_z(ranked, authrs)
                termlist.append(sub_term)
    for term in tfidf().build_tokenizer()(query):
        for word in tfidf().build_tokenizer()(term):
            if word not in termlist:
                ranks, authrs = topic_z_scorer(word)
                authrs = {auth: authrs[auth] * 1.25 for auth in authrs}
                if not ranked:
                    ranked = authrs
                else:
                    ranked = combine_z(ranked, authrs)
                termlist.append(word)
    ranks_final = sorted([(ranked[key], key) for key in ranked.keys()])
    ranks_final.reverse()
    # print(ranks_final)
    return [name[1] for name in ranks_final]
def tf(sent):
    words = tfidf().build_tokenizer()(sent.lower())
    tf = {}
    for word in words:
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
    for key in tf.keys():
        tf[key] = tf[key] / len(words)
    return tf
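# Hypothetical usage sketch (assumes Python 3 division and the TfidfVectorizer
# alias used above): each value is the token count divided by the total number
# of tokens in the sentence; the default token pattern drops single-character
# tokens.
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

print(tf("The cat sat on the mat"))
# e.g. {'the': 0.333..., 'cat': 0.166..., 'sat': 0.166..., 'on': 0.166..., 'mat': 0.166...}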
def activate_tf_idf(clean_text, tags, top_n, model="All", n_estimator=2000,
                    mindf=0.001, ng=1):
    print(model)
    # NOTE: `txt` is a module-level variable in the original script (see the
    # __main__ block); it is not defined inside this function.
    dft = pd.DataFrame({"Data": [txt], "Tag": ['M']})
    clean_text_test, tags_test = cleaner(dft)
    if model == 'None':
        print("\nActivating tfidf analysis for unsupervised learning:")
        # ngram_range=(0, ng) is kept as in the original (here and below);
        # scikit-learn normally expects the lower bound to be at least 1.
        tf_idf = tfidf(sublinear_tf=True, min_df=mindf, ngram_range=(0, ng))
        X = tf_idf.fit_transform(clean_text)
        dtm = pd.DataFrame(X.toarray())
        features = tf_idf.get_feature_names()
        return dtm, features
    else:
        print("\nActivating tfidf analysis:")
        tf_idf = tfidf(sublinear_tf=True, min_df=mindf, ngram_range=(0, ng))
        X = tf_idf.fit_transform(clean_text)
        dtm = pd.DataFrame(X.toarray())
        features = tf_idf.get_feature_names()
        print("Total number of features are:", len(features))
        # phrase_extraction_tfidf(dtm, features, clean_text, tags, top_n, tf_idf=tf_idf)
        if top_n:
            indices = np.argsort(tf_idf.idf_)[::-1]
            features = tf_idf.get_feature_names()
            top = top_n
            top_features = [features[i] for i in indices[:top]]
            print("\nHere are the top {} important features:".format(top))
            print(top_features)
        X_train, X_test, y_train, y_test = train_test_split(dtm, tags,
                                                            test_size=0.05,
                                                            random_state=40)
        X_test = dtm.iloc[0:31, :]
        y_test = tags[0:31]
        new_test = tf_idf.transform(clean_text_test)
        model_runner(model, n_estimator, X_train, X_test, y_train, y_test,
                     new_test)
def get_tfidf(df):
    df.label = df.label.apply(lambda x: 1 if x == 'lib'
                              else 0 if x == 'con' else -1)
    v = tfidf()
    return pd.DataFrame(v.fit_transform(df['sent']).toarray())
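# Hypothetical usage sketch (the column names 'sent' and 'label' follow the
# function body; pandas and the TfidfVectorizer alias are assumed as above):
# the label column is mapped to 1/0/-1 in place and a dense tf-idf DataFrame of
# the sentences is returned.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

demo = pd.DataFrame({
    "sent": ["taxes should be lower", "healthcare should be universal"],
    "label": ["con", "lib"],
})
print(get_tfidf(demo).shape)  # (2, n_features)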
files = []
terms_desc = {}
for term in terms:
    med_term = term.string
    if len(med_term) > 1:
        print(med_term + "," + term.next_sibling.next_sibling.string)
        # `tf` is assumed to be a temporary-file factory (e.g. tempfile.TemporaryFile)
        # aliased earlier in the original script; each description is written to
        # its own file-like object and rewound.
        filer = tf("r+")
        filer.write(str(term.next_sibling.next_sibling.string))
        terms_desc[med_term] = str(term.next_sibling.next_sibling.string)
        filer.seek(0)
        files.append(filer)

for filer in files:
    print(filer.read())
    filer.seek(0)

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

# input="file" tells the vectorizer that fit_transform() will receive file
# objects rather than raw strings (the original passed "file" positionally).
learnt = tfidf(input="file")
transd = learnt.fit_transform(files)
print(transd)

import pickle

with open("learnt-tf.pck", "w+b") as ltf:
    pickle.dump(learnt, ltf)
with open("terms-desc.pck", "w+b") as ted_def:
    pickle.dump(terms_desc, ted_def)
# Fragment: the indented block below appears to be the body of a
# query_terms(tfs_list_sorted, terms_desc) function in the original module;
# possible_terms and term_threshold are defined in the part of that function
# that is not included in this snippet.
    if len(tfs_list_sorted) < 2:
        # print("Exhausted")
        if len(terms_desc) < 10:
            return list(terms_desc.keys())
        else:
            return []
    for term in terms_desc.keys():
        if search(tfs_list_sorted[0][1], terms_desc[term]) or search(
                tfs_list_sorted[1][1], terms_desc[term]):
            possible_terms[term] = terms_desc[term]
    if len(possible_terms.keys()) <= term_threshold:
        return possible_terms.keys()
    else:
        possiblity = query_terms(tfs_list_sorted[1:], possible_terms)
        if not possiblity:
            print(possiblity)
            return list(possible_terms.keys())
        else:
            return possiblity


import pickle
import json


def searcher(query):
    with open("learnt-tf.pck", "r+b") as ltf, open("terms-desc.json", "r") as ted_def:
        learnt = pickle.load(ltf)
        terms_desc = json.load(ted_def)
        return query_terms(tf_tdf_sent(query, learnt)[1], terms_desc)


if __name__ == '__main__':
    print(searcher("My baby is heart"))
    print(tfidf().build_tokenizer()("My baby is weighing less."))
print(" new pred") print(clfnb.predict(new_test.toarray())) print ("Model Accuracy for cross validation - ", np.mean(cross_val_score(clfnb, X_test, y_test, cv=10))) print ("Accuracy of the model on testing data: {}% ".format(accuracy_score(y_test, prediction) * 100)) print ("F1 score: {}".format(f1_score(y_test, prediction, average='micro'))) print("Confusion Matrix: ") print (confusion_matrix(y_test, prediction)) return True if __name__ == "__main__": df = pd.read_csv("C:\\Users\Sheel\PycharmProjects\\NLP\Data_Set.txt", sep="\t", encoding='latin-1', header=None) dsi = {"URL": df[0].tolist(), "Tag": df[1].tolist(), "Data": df[2].tolist()} dsi_df = pd.DataFrame(dsi) clean_text, tags= cleaner(dsi_df) cv = tfidf(sublinear_tf=True, min_df=0.05) X = cv.fit_transform(clean_text) dtm = pd.DataFrame(X.toarray(), columns=cv.get_feature_names()) X_train=dtm y_train=tags X_test= dtm.iloc[0:31,:] y_test= tags[0:31] txt = """Michel Salama HerszageComcast adquire mais de 75% das ações da Sky Operação movimentou US$ 40 bilhões A Comcast informou que garantiu mais de 75 por cento das ações da Sky, aproximando-se de finalizar a aquisição do grupo britânico de TV paga por 40 bilhões de dólares. A empresa norte-americana de televisão a cabo Comcast disse anteriormente que espera que a aquisição seja concluída até o final de outubro. No mês passado, a Comcast emergiu triunfante na longa batalha pela Sky depois de uma disputa com a Twenty- 8:51 PMMichel Salama HerszageFirst Century Fox, de Rupert Murdoch, em um leilão. A Comcast informou em comunicado nesta quinta-feira que, até 9 de outubro, quando concluir a compra da participação de 39 por cento da Twenty-First Century Fox, já manterá ou terá recebido aceitações em mais de 75 por cento do capital social da Sky. A empresa disse que um novo anúncio será feito no devido tempo. """ dft = pd.DataFrame({"Data": [txt], "Tag": ['M']}) clean_text_test, tags_test = cleaner(dft) new_test = cv.transform(clean_text_test) # print(new_test) run_random_forest_classifier() run_nb_classifier() print (dtm.shape)
def tfidf_maker(df_parole):
    maker = tfidf()
    matrix = np.matrix(maker.fit_transform(counter_maker(df_parole)).toarray())
    df_tfidf = pd.DataFrame(matrix, index=df_parole['groupe'])
    return df_tfidf
        # (inside the per-email loop of the original script)
        # use str.replace() to remove any instances of the words
        for w in ["sara", "shackleton", "chris", "germani",
                  "sshacklensf", "cgermannsf"]:
            text = text.replace(w, '')

        # append the text to word_data
        word_data.append(text)

        # append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if name == 'sara':
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

print word_data[152]

# in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

vec = tfidf(stop_words='english')
matrix = vec.fit_transform(word_data)
def __init__(self, n_grams=[1, 2, 3], config_identifier='n_gram_123'):
    super(TfIdfVectorizer, self).__init__(config_identifier=config_identifier)
    self._vec = tfidf(ngram_range=(min(n_grams), max(n_grams)))
    self.n_grams = n_grams
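# Hypothetical illustration (not from the original source): with the default
# n_grams=[1, 2, 3] the wrapped vectorizer above covers word n-grams from
# unigrams up to trigrams, i.e. ngram_range=(1, 3).
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

example_vec = tfidf(ngram_range=(min([1, 2, 3]), max([1, 2, 3])))
print(example_vec.ngram_range)  # (1, 3)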
if __name__ == '__main__': print("读入训练数据...") TrainIndex, TrainTitle, TrainDescri, TrainWhole, TrainCount = loadDateSet( '../ag_news_csv/train.csv') print("读入测试数据...") TestIndexTarget, TestTitle, TestDescri, TestWhole, TestCount = loadDateSet( '../ag_news_csv/test.csv') print("生成TF-IDF词向量空间...") Title = TrainTitle + TestTitle # --------------------------------------------------------------title合集 Descri = TrainDescri + TestDescri # -----------------------------------------------------------descri合集 Whole = TrainWhole + TestWhole # --------------------------------------------------------------Whole合集 cvector = CountVectorizer( stop_words='english', min_df=2, max_features=100) # -----------------------------------特征提取器,避开英文停用词 transformer = tfidf( ) # -----------------------------------------------------------------------计算tfidf特征 temp = cvector.fit_transform( Title ) # ------------------------------------------------------每一行是一篇文章的词向量 # temp = transformer.fit_transform(cvector.fit_transform(Title))#------------------------------每一行是一篇文章的词向量 TrainTitle_bag = temp[0:TrainCount] TestTitle_bag = temp[TrainCount:temp.shape[0]] temp = cvector.fit_transform( Descri ) # ------------------------------------------------------每一行是一篇文章的词向量 # temp = transformer.fit_transform(cvector.fit_transform(Descri))#-----------------------------每一行是一篇文章的词向量 TrainDescri_bag = temp[0:TrainCount] TestDescri_bag = temp[TrainCount:temp.shape[0]] temp = cvector.fit_transform( Whole ) # ------------------------------------------------------每一行是一篇文章的词向量
# boxplot of absolute frequencies
plt.boxplot(presence)
plt.show()

# plot of absolute frequencies
plt.plot(xrange(len(presence)), sorted(presence))
plt.show()

from sklearn.metrics.pairwise import pairwise_distances

distances = pairwise_distances(data, metric='euclidean')
plt.imshow(distances)
plt.title('Euclidean Similarity of initial data')
plt.colorbar()

distances = pairwise_distances(data, metric='cosine')
plt.figure(4)
plt.imshow(distances)
plt.title('Cosine Similarity of initial data')
plt.colorbar()
plt.show()

"""
Processing data
"""
from sklearn.feature_extraction.text import TfidfTransformer as tfidf

transformer = tfidf()
data = transformer.fit_transform(data)

distances = pairwise_distances(data, metric='cosine')
plt.figure(6)
plt.imshow(distances)
plt.title('Cosine Similarity of tfidf data')
plt.colorbar()
plt.show()
# In[3]:

np.argwhere(np.isnan(embed))

# In[4]:

document = pd.concat((train['comment_text'], test['comment_text']))
# fillna() is not in-place, so reassign its result (the original discarded it).
document = document.fillna('')

tfidf_1gram = tfidf(stop_words="english", ngram_range=(1, 4), max_features=50000,
                    sublinear_tf=True, strip_accents="unicode", min_df=3,
                    max_df=0.9)
# tfidf_2gram = tfidf(stop_words="english", ngram_range=(2,4), max_features=20000, sublinear_tf=True, strip_accents="unicode", min_df=3)
# tfidf_chargram = tfidf(encoding='unicode', analyzer='char', ngram_range=(2,6), sublinear_tf=True, max_features=40000)

tfidf_1gram = tfidf_1gram.fit(document)
# tfidf_2gram = tfidf_2gram.fit(document)
# tfidf_chargram = tfidf_chargram.fit(document)

train_f = pd.read_csv("train_f.csv")
test_f = pd.read_csv("test_f.csv")

train_tfidf = tfidf_1gram.transform(train['comment_text'])
test_tfidf = tfidf_1gram.transform(test['comment_text'])

# In[10]:
findElbow(x_iris)

# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()

# In[104]:

wordVector = cv(stop_words='english', encoding='latin-1')
wordWeights = tfidf(stop_words='english', encoding='latin-1')
corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])
bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)

# In[105]:

weightybags = weightybags.toarray()

# In[106]:

bag = bag.toarray()
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

d = []
data_stem_lem = []
for doc in data_st:
    for word in doc:
        # word = porter.stem(word)
        word = wordnet.lemmatize(word)
        d.append(word)
    data_stem_lem.append(d)
    d = []

# reverse of tokenisation for each document
X = [(" ").join(doc) for doc in data_stem_lem]

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

T = tfidf()  # define tfidf object
T.fit(X)  # fit on the data to get tf * idf values
data_TFIDF = T.transform(X)  # transform to a tf*idf array; this will be a sparse matrix
# featuresNames = T.get_feature_names()  # get all the feature names

# Sparsity reduction
f = []
for i in range(data_TFIDF.shape[1]):
    # keep only columns with more than 1% non-zero values
    # (i.e. remove columns that are more than 99% sparse)
    if (data_TFIDF[:, i].count_nonzero() / data_TFIDF.shape[0]) > 0.01:
        f.append(i)

# Final data
X = data_TFIDF[:, f]
""" from sklearn.svm import LinearSVC from sklearn.linear_model import LogisticRegression from sklearn.dummy import DummyClassifier from sklearn.pipeline import Pipeline from sklearn.pipeline import FeatureUnion from sklearn.feature_extraction.text import TfidfVectorizer as tfidf from sklearn.feature_extraction.text import CountVectorizer as vectorizer ngram = Pipeline([('features', FeatureUnion([('wrd', tfidf(binary=False, max_df=1.0, min_df=2, norm='l2', sublinear_tf=True, use_idf=True, lowercase=True)), ('char', tfidf(analyzer='char', ngram_range=(3, 6), binary=False, max_df=1.0, min_df=2, norm='l2', sublinear_tf=True, use_idf=True, lowercase=True))])), ('clf', LinearSVC())]) words = Pipeline([('features',
import networkx as nx

headerFilename = "header.csv"
file = open(headerFilename)
headers = file.readlines()[0].split('\r')

filename = "centroids.csv"
data = np.genfromtxt(filename, delimiter=" ")
print(data.shape)

# `tfidf` here is presumably sklearn's TfidfTransformer, since the input is a
# numeric centroid matrix rather than raw text documents.
a = tfidf()
tData = a.fit_transform(data)
tData = tData.toarray()

plt.figure()
plt.subplot(2, 1, 1)
plt.imshow(data, vmin=0, vmax=1)
plt.title("original centroids")
plt.colorbar()
plt.subplot(2, 1, 2)
plt.imshow(tData, vmin=0, vmax=1)
plt.colorbar()
plt.title("tf-idf")

# tData = tData - np.mean(tData, 0)
# plt.figure()
# (tail of a tokenisation loop in the original script)
    test.append(t.tokenize(rm[i], wakati=True))

for i in range(len(train)):
    sumt.append(train[i])
for i in range(len(test)):
    sumt.append(test[i])

# Join the tokens back into space-separated documents (wakati-gaki)
x_doc = []
for i in range(len(sumt)):
    doc = ""
    for j in range(len(sumt[i])):
        doc = doc + " " + sumt[i][j]
    x_doc.append(doc)

# TF-IDF
tivec = tfidf()
x = tivec.fit_transform(x_doc)
x = x.toarray()

# Split the recognized documents from the script
x_train = x[0:int(len(x) / 2)]
x_test = x[int(len(x) / 2):len(x)]

out = []
# Swap each recognized line for the most similar script line
for i in range(len(x_train)):
    num = 0
    sim = np.dot(x_train[i], x_test[0]) / (np.linalg.norm(x_train[i]) *
                                           np.linalg.norm(x_test[0]))
    for j in range(len(x_test)):
        sim2 = np.dot(x_train[i], x_test[j]) / (np.linalg.norm(x_train[i]) *
"cgermannsf" ] for word in stopwords: text = text.replace(word, "") ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == "sara": from_data.append(0) else: from_data.append(1) email.close() print("emails processed") print("word data:", word_data[152]) from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "wb")) pickle.dump(from_data, open("your_email_authors.pkl", "wb")) ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer as tfidf vectorizer = tfidf(stop_words='english') transformed = vectorizer.fit_transform(word_data) print(len(vectorizer.get_feature_names())) print(vectorizer.get_feature_names()[34597])
charged protons from one another. Under certain circumstances, the repelling
electromagnetic force becomes stronger than the nuclear force. In this case,
the nucleus splits and leaves behind different elements. This is a form of
nuclear decay.'''

# dependencies
import re  # regular expressions
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt  # for tokenization
from nltk.corpus import stopwords  # stop words
from nltk.stem import WordNetLemmatizer as wl  # for lemmatization

wordnet = wl()  # object creation for lemmatization
corpus = []  # empty list
sentences = st(para)  # tokenizing the paragraph into sentences

for i in range(len(sentences)):
    rev = re.sub('[^a-zA-Z]', ' ', sentences[i])  # replace every character except letters with a space
    rev = rev.lower()  # lowercase the sentence
    rev = rev.split()  # each word becomes an element of a list
    rev = [wordnet.lemmatize(word) for word in rev
           if word not in stopwords.words('english')]
    rev = ' '.join(rev)
    corpus.append(rev)

# creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

cv = tfidf()  # object creation
x = cv.fit_transform(corpus).toarray()  # transforming