def main():
    # create tweets dataframe
    tweets = tfidf.build_corpus_from_csv(dataFile)
    # create just a list of tweets
    tweets_only = [tweet for tweet in tweets['Tweet']]
    # define stopset
    stopset = set(stopwords.words('english'))
    # tokenize the tweets in place
    tweets['Tweet'] = tfidf.tokenize_corpus(tweets['Tweet'], stopset)
    # print the 10 most frequent words for each tweet
    get_most_frequent_words(tweets, 10)

    # create vectorizer
    vectorizer = TfidfVectorizer(input='content', stop_words=stopset)
    # fit the vectorizer
    vectorizer.fit_transform(tweets_only)
    # get feature names
    tweet_features = vectorizer.get_feature_names()

    # generate frequency distributions for each tweet
    freqs = []
    indices = []
    for (num, entry) in tweets.iterrows():
        freqs.append(FreqDist(entry['Tweet']))
        indices.append(num)

    # loop over the features and insert frequencies in the dataframe
    for feature in tweet_features:
        tweets[feature] = pd.Series([fd[feature] for fd in freqs], index=indices)

    # output a csv
    tweets.to_csv('frequencies.csv')
class Q3Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
    class variables: self.col; self.vectorizer
    '''
    def __init__(self):
        self.col = 'categories'  # initialize the column name

    def fit(self, X, y=None):
        # pick the column
        pick_category = pick(self.col, X)
        category_train = [' '.join(pick_category[i].values()[0])
                          for i in range(0, len(pick_category))]
        # transform the training records
        self.vectorizer = TfidfVectorizer(min_df=1)
        self.vectorizer.fit_transform(category_train)
        return self

    def transform(self, X):
        # transform the test record
        if type(X) is list:
            pick_category = pick(self.col, X)
            category_X = [' '.join(pick_category[i].values()[0])
                          for i in range(0, len(pick_category))]
        else:
            category_X = [' '.join(X[self.col])]
        X_trans = self.vectorizer.transform(category_X)
        return X_trans
def getNewsContext(newsObj,ent_ind,ents,vocab,window): ent_text = {} for e in ent_ind: ent_text[e] = '' sentencesIn = [] sentencesInObj= [] entsIn = [] # binary matrix indices = [] indptr = [0] for news in newsObj: h_ent = news.h_ent s = makeEntText(h_ent,ent_text,ent_ind,indices,indptr,window) if s: sentencesIn.append( s ) sentencesInObj.append(Sentence(s,news.created_at,h_ent,news.title)) b_ent = news.b_ent for sentence in sent_detector.tokenize(b_ent.strip()): s = makeEntText(sentence,ent_text,ent_ind,indices,indptr,window) if s: sentencesIn.append( s ) sentencesInObj.append(Sentence(s,news.created_at,sentence,news.title)) newsVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False, tokenizer=lambda text: news_tokenizer(text,'reg')) XN = newsVectorizer.fit_transform(sentencesIn) # for e in ents: entsIn.append(ent_text[e]) XEn = newsVectorizer.fit_transform(entsIn) NEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(sentencesIn),len(ents) )) return XN,XEn,NEb,sentencesIn,sentencesInObj,ent_text
def getTweetContext(tweetsObj,ent_ind,ents,vocab,window): ent_text = {} for e in ent_ind: ent_text[e] = '' t0 = time() tweetsIn = [] tweetsInObj = [] entsIn = [] indices = [] indptr = [0] for i in tweetsObj: tweet = tweetsObj[i] tokens_ent = tweet.tokens_ent t = makeEntText(tokens_ent,ent_text,ent_ind,indices,indptr,window) if t: tweetsIn.append( t ) tweetsInObj.append( tweet ) print( "append in "+str(time() - t0)) t0 = time() tweetVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False, tokenizer=lambda text: tweet_tokenizer(text,'reg')) XT = tweetVectorizer.fit_transform(tweetsIn) print( "vectorize in "+str(time() - t0)) t0 = time() for e in ents: entsIn.append(ent_text[e]) XEt = tweetVectorizer.fit_transform(entsIn) print( "ents append + vec in "+str(time() - t0)) TEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(tweetsIn),len(ents) )) return XT,XEt,TEb,tweetsIn,tweetsInObj,ent_text
def get_bow_vect_data_test(classif_data):
    vect = TfidfVectorizer()
    vect.fit_transform([classif_data["corpus"]])

    # Before we begin, get rid of any test articles with no topic
    vect_token_sets = []
    vect_test_sets = []

    # Transform testing and training data
    for i in classif_data["train_tokens"]:
        vect_token_sets.append(vect.transform([i]).toarray())
    for i in classif_data["test_tokens"]:
        vect_test_sets.append(vect.transform([i]).toarray())

    train_set = []
    test_set = []
    for i in vect_token_sets:
        train_set.append(i[0])
    for i in vect_test_sets:
        test_set.append(i[0])

    return {
        "vectorizer": vect,
        "train_vect": train_set,
        "test_vect": test_set
    }
class Classifier(object):

    def __init__(self):
        self.classifier = LogisticRegression(intercept_scaling=100)
        self.vectorizer = TfidfVectorizer()

    def trainvectorizer(self, corpus):
        self.vectorizer.fit_transform(corpus)
        file1 = open("feature_names.txt", "w")
        names = self.vectorizer.get_feature_names()
        print len(names)
        for name in names:
            file1.write(name.encode('utf8') + "\n")
        file1.close()
        print "vectorizer training is over..."

    def trainclassifier(self, train_X, train_Y):
        self.classifier.fit(train_X, train_Y)
        print "classifier training is over..."

    def getfeature(self, text):
        # return a feature array
        matrix = self.vectorizer.transform([text]).toarray()
        array = matrix[0]
        return array

    def getresult(self, feature):
        # return true or false
        return self.classifier.predict(feature)
def tfidf_score(train_set, test_set):
    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))

    # Remove all the None types from the input datasets
    train_set = filter(None, train_set)
    test_set = filter(None, test_set)

    vectorizer.fit_transform(train_set)
    # print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)

    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    # print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)

    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()

    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
def readFile(filename): global vectorizer train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3) train_size = train_data.shape[0] clean_train = [] for i in xrange(0,train_size): clean_train.append(filter(train_data['review'][i])) #if i%1000 ==0: # print '%d reviews processed...' %i #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000) if vectorizer==None: vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.5, max_features = 50000) train_data_feature = vectorizer.fit_transform(clean_train) else: vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_) train_data_feature = vec.fit_transform(clean_train) print train_data_feature.shape if 'test' in filename: return train_data['id'], train_data_feature else: return train_data['id'], train_data_feature, train_data['sentiment']
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]
    train_set = sorted(list(set(train_set1 + train_set2)))

    # Create dictionary to find movie
    dictTrain = dict()
    for i, movie in enumerate(train_set):
        dictTrain[movie] = i

    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    ## Tri-grams
    lenGram = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i + lenGram] for i in range(len(mov) - 1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))
    train_setBigrams = sorted(list(set(train_setBigrams)))

    dictTrainBigrams = dict()
    for i, movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)

    return [tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram]
def get_IDF_topn_words(data=[], n=3, vocabulary=None):
    vect = TfidfVectorizer(vocabulary=vocabulary)
    vect.fit_transform(data)
    # sort features by descending idf_ (highest-idf, i.e. rarest, terms first)
    indices = np.argsort(vect.idf_)[::-1]
    features = vect.get_feature_names()
    top_features = [features[i] for i in indices[:n]]
    return top_features
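# A brief usage sketch for the helper above; the toy documents are assumed
# examples, not part of the original source.
docs = ["the cat sat on the mat",
        "the dog chased the cat",
        "dogs and cats are pets"]
# returns the n terms with the highest idf_, i.e. the rarest terms in the corpus
print(get_IDF_topn_words(data=docs, n=3))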
def classify_svm(text):
    coarse_X = sets['coarse_training_qs']
    coarse_Y = sets['coarse_training_targets']
    fine_X = sets['fine_training_qs']
    fine_Y = sets['fine_training_targets']

    # use one vectorizer per level so each SVM sees a consistent feature space;
    # refitting a single vectorizer would invalidate the first transformed matrix
    vectz_coarse = TfidfVectorizer(min_df=2, decode_error="ignore")
    vectz_fine = TfidfVectorizer(min_df=2, decode_error="ignore")
    coarse_X = vectz_coarse.fit_transform(coarse_X)
    fine_X = vectz_fine.fit_transform(fine_X)

    # coarse
    svm_coarse = SVC(C=1000, gamma=0.001, kernel='rbf')
    svm_coarse.fit(coarse_X, coarse_Y)
    # predict
    coarse_predict = svm_coarse.predict(vectz_coarse.transform([text]).toarray())

    # fine
    svm_fine = SVC(C=1000, gamma=0.001, kernel='rbf')
    svm_fine.fit(fine_X, fine_Y)
    # predict
    fine_predict = svm_fine.predict(vectz_fine.transform([text]).toarray())

    results = {}
    results['coarse_class'] = coarse_predict[0]
    results['fine_class'] = fine_predict[0]
    return results
def doTFIDF(train, test1, test2):
    stemmedTrain = stemIt(train)
    stemmedTest1 = stemIt(test1)
    stemmedTest2 = stemIt(test2)
    print "done stemming tweets"

    regTrain = processIt(train)
    regTest1 = processIt(test1)
    regTest2 = processIt(test2)

    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(regTrain)
    Xtest1 = vectorizer.transform(regTest1)
    Xtest2 = vectorizer.transform(regTest2)
    scipy.io.mmwrite('train_reg_dataM', X, field='real')
    scipy.io.mmwrite('test1_reg_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_reg_dataM', Xtest2, field='real')

    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
    X = vectorizer.fit_transform(stemmedTrain)
    Xtest1 = vectorizer.transform(stemmedTest1)
    Xtest2 = vectorizer.transform(stemmedTest2)
    scipy.io.mmwrite('train_stem_dataM', X, field='real')
    scipy.io.mmwrite('test1_stem_dataM', Xtest1, field='real')
    scipy.io.mmwrite('test2_stem_dataM', Xtest2, field='real')
def get_features(vocab): vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2') X_train_head = vectorizer_head.fit_transform(headlines) vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2') X_train_body = vectorizer_body.fit_transform(bodies) # calculates n most important topics of the bodies. Each topic contains all words but ordered by importance. The # more important topic words a body contains of a certain topic, the higher its value for this topic lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3) print("latent_dirichlet_allocation_cos: fit and transform body") t0 = time() lda_body_matrix = lda_body.fit_transform(X_train_body) print("done in %0.3fs." % (time() - t0)) print("latent_dirichlet_allocation_cos: transform head") # use the lda trained for body topcis on the headlines => if the headlines and bodies share topics # their vectors should be similar lda_head_matrix = lda_body.transform(X_train_head) #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100) print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body') # calculate cosine distance between the body and head X = [] for i in range(len(lda_head_matrix)): X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1)) cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten() X.append(cos_dist.tolist()) return X
def get_samples_predictions(all_words, percent):

    def tokenizer(string):
        stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
        regex = re.compile(r'\w\w+')
        return tuple(stemmer.stem(w) for w in regex.findall(string))

    vectorizer = TfidfVectorizer(
        input='filename',
        tokenizer=tokenizer,
        ngram_range=(1, 3),
        stop_words=stopwords.words(),
        max_df=0.95,        # ignore words with a document frequency above 95% (corpus-specific stopwords)
        vocabulary=all_words,
        use_idf=True,       # use inverse-document-frequency reweighting
        sublinear_tf=True   # replace tf with 1 + log(tf)
    )

    sample_fids, predictions = list(), list()
    for category in CATEGORIES:
        for fid in corpus.fileids(categories=category):
            sample_fids.append(os.path.join(FID_DIRECTORY, fid))
            predictions.append(CATEGORIES.index(category))

    shuffle_list(sample_fids, seed=123)
    shuffle_list(predictions, seed=123)

    training_fids, test_fids = split_list(sample_fids, percent=percent)
    training_samples = vectorizer.fit_transform(training_fids)
    # transform (not fit_transform) the test files so they share the training vocabulary
    test_samples = vectorizer.transform(test_fids)
    training_predictions, test_predictions = split_list(predictions, percent=percent)

    return training_samples, training_predictions, test_samples, test_predictions
def feature_tfidf(train_lines, test_lines, train_text_index, test_text_index): start = time.time() train_text_arr, forward_train, comment_train, like_train = file_to_arr(train_lines, train_text_index, 'train') test_text_arr = file_to_arr(test_lines, test_text_index, 'test') end = time.time() print 'train and test file to array fininshed with: ' + str(end - start) start = time.time() # debug start # train_text_arr_nozero = [] # comment_train_nozero = [] # for i in range(len(comment_train)): # if int(comment_train[i]) != 0: # train_text_arr_nozero.append(train_text_arr[i]) # comment_train_nozero.append(comment_train[i]) # train_text_arr = train_text_arr_nozero # comment_train = comment_train_nozero # debug end tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5) tfidf_train = tv.fit_transform(train_text_arr) tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_) tfidf_test = tv2.fit_transform(test_text_arr) end = time.time() print 'train and test array to tfidf feature fininshed with: ' + str(end - start) return tfidf_train, tfidf_test, forward_train, comment_train, like_train
class text_similarity(): def __init__(self, booklist): self.booklist = booklist self.alltext = '' self.merged = pd.DataFrame(columns = ['Book', 'Chapter', 'Verse', 'Original Text', 'Formatted Text']) self.booklistnames = [] for i, book in enumerate(booklist): self.alltext += book.all_text self.merged = pd.merge(self.merged, book.df, how = 'outer') self.booklistnames = np.append(self.booklistnames, book.unique_books) self.vocabulary = self.alltext.split() self.vocabulary = [word for word in self.vocabulary if len(word) > 2] self.cosinedf = pd.DataFrame(columns=self.booklistnames, index=self.booklistnames) self.merged['Source'] = '' sources = [] for i, book in enumerate(self.merged['Book']): for books in self.booklist: if book in books.unique_books: sources.append(books.name) self.merged['Source'] = sources self.vect = TfidfVectorizer(stop_words='english') self.vect.fit_transform(self.vocabulary) self.vectorize() self.cosine() def vectorize(self): self.tfidf_df = pd.DataFrame(columns= ['Book', 'Vector']) self.tfidf_df['Book'] = self.booklistnames for i, book in enumerate(self.booklistnames): joined = " ".join(self.merged[self.merged['Book'] == book]['Formatted Text'].values) self.tfidf_df.iloc[i, 1] = self.vect.transform([joined]) vectors = [] for i, line in enumerate(self.merged['Formatted Text']): vectors.append(self.vect.transform([line])) if i % 10 == 0: print i self.merged['Vectors'] = vectors print "vectorization complete" def cosine(self): self.cosinedf['Source'] = '' sources = [] for i, book in enumerate(self.cosinedf): for books in self.booklist: if book in books.unique_books: sources.append(books.name) self.cosinedf['Source'] = sources for i, book1 in enumerate(self.booklistnames): for j, book2 in enumerate(self.booklistnames): if book1 == book2: self.cosinedf[book1][book2] = 1. elif i<j: self.cosinedf[book1][book2] = cosine_similarity(self.tfidf_df[self.tfidf_df['Book'] == book1]['Vector'].values[0], self.tfidf_df[self.tfidf_df['Book'] == book2]['Vector'].values[0])[0][0] print "cosine similarity complete"
def create_tf_idf_sim_matrix( title_rev_log, desc_rev_log, cr_area_top_level, title_file_name): #print "Title- rev", title_rev_log #print "Desc-rev", desc_rev_log #print "cr_area_top_level", cr_area_top_level #print "title_file_name", title_file_name # tfidf_vectorizer = TfidfVectorizer(stop_words='english',decode_error='ignore') tfidf_vectorizer = TfidfVectorizer(decode_error='ignore') title_rev_log_tfidf_matrix = tfidf_vectorizer.fit_transform(title_rev_log) desc_rev_log_tfidf_matrix = tfidf_vectorizer.fit_transform(desc_rev_log) cr_area_top_level_tfidf_matrix = tfidf_vectorizer.fit_transform(cr_area_top_level) title_file_name_tfidf_matrix = tfidf_vectorizer.fit_transform(title_file_name) #print "size=", title_rev_log_tfidf_matrix.shape, desc_rev_log_tfidf_matrix.shape, cr_area_top_level_tfidf_matrix.shape, title_file_name_tfidf_matrix.shape #print "Title Rev Log=", title_rev_log_tfidf_matrix #print "Desc rev log = ", desc_rev_log_tfidf_matrix #print "cr area top level=", cr_area_top_level_tfidf_matrix #print "title file name=", title_file_name_tfidf_matrix title_rev_log_sim_matrix = cosine_similarity(title_rev_log_tfidf_matrix[0:1], title_rev_log_tfidf_matrix) desc_rev_log_sim_matrix = cosine_similarity(desc_rev_log_tfidf_matrix[0:1], desc_rev_log_tfidf_matrix) cr_area_top_level_sim_matrix = cosine_similarity(cr_area_top_level_tfidf_matrix[0:1], cr_area_top_level_tfidf_matrix) title_file_name_sim_matrix = cosine_similarity( title_file_name_tfidf_matrix[0:1], title_file_name_tfidf_matrix) #print "sim title-rev log", title_rev_log_sim_matrix #print "desc rev log", desc_rev_log_sim_matrix #print "cr area top", cr_area_top_level_sim_matrix #print "title file name", title_file_name_sim_matrix return title_rev_log_sim_matrix, desc_rev_log_sim_matrix, cr_area_top_level_sim_matrix, title_file_name_sim_matrix
def readFile(filename): global vectorizer train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3) train_size = train_data.shape[0] clean_train = [] for i in xrange(0,train_size): clean_train.append(filter(train_data['review'][i])) if i%1000 ==0: print '%d reviews processed...' %i from sklearn.feature_extraction.text import TfidfVectorizer if vectorizer==None: vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.9,ngram_range=(1,3),max_features=100000) train_data_feature = vectorizer.fit_transform(clean_train) else: vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_) train_data_feature = vec.fit_transform(clean_train) print train_data_feature.shape if 'test' in filename: return train_data['id'], train_data_feature else: return train_data['id'], train_data_feature, train_data['sentiment']
def classify(good_deals, bad_deals, dictionary):
    word_with_low_freq = [word for word in dictionary.elements() if dictionary[word] < 1]
    for word in word_with_low_freq:
        del dictionary[word]

    tfidf_vectorizer = TfidfVectorizer(vocabulary=dictionary)
    good_tfidf = tfidf_vectorizer.fit_transform(good_deals)
    bad_tfidf = tfidf_vectorizer.fit_transform(bad_deals)
    good_tfidf = good_tfidf.todense()
    bad_tfidf = bad_tfidf.todense()

    svm_data = []
    svm_data.append(good_tfidf)
    svm_data.append(bad_tfidf)
    svm_data = np.concatenate(svm_data)

    svm_pos_labels = np.ones(len(good_tfidf))
    svm_neg_labels = np.zeros(len(bad_tfidf))
    labels = []
    labels.append(svm_pos_labels)
    labels.append(svm_neg_labels)
    svm_labels = np.concatenate(labels)

    param_grid = [
        {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['rbf']},
    ]
    svc = svm.SVC()
    clf = grid_search.GridSearchCV(estimator=svc, param_grid=param_grid, n_jobs=1)
    print "Training SVM classifier over a grid of C and gamma values to select the best parameters\n"
    clf.fit(svm_data, svm_labels)

    # fitted GridSearchCV attributes end with an underscore
    print "svm score", clf.best_score_
    print "svm gamma value", clf.best_estimator_.gamma
    print "svm C value", clf.best_estimator_.C
    print "svm kernel", clf.best_estimator_.kernel
    return clf
def score(testStr,candList1): batch_sz=1000 from sklearn.feature_extraction.text import TfidfVectorizer totCandidate=[];totInd=[] batch_num=int(math.ceil(len(candList)/float(batch_sz))) #51/50.0->2.0 for batch in range(batch_num)[:]: corpus=np.array(candList1)[batch*batch_sz:(batch+1)*batch_sz];#list print corpus[0],corpus[1] #'北京市 海淀区 西三旗' '人民日报社 爱玛 客 餐厅' ############# # tf idf vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=1) corpus=list(corpus) corpus.append(testStr) rst=vectorizer.fit_transform(corpus)#the last one is testStr #print vectorizer.get_feature_names() #for w in vectorizer.get_feature_names(): # print w ##no '客' rst=rst.toarray() #print 'feature',rst.shape #[n,dim] ################ # calculate distance test=rst[-1,:].reshape((1,-1))#[1,d] compare=rst[:-1,:] #[n,d] dist=calc_EuDistance(test,compare);#print 'eu-dist min max',np.min(dist),np.max(dist) rank=np.argsort(dist)[:50]#index ,from smallScore->largeScore sort candidateList=[corpus[ii] for ii in rank]#list totCandidate=totCandidate+candidateList # indList=[batch*batch_sz+ij for ij in rank] totInd=totInd+indList #score=dist[rank] #array #for i in range(len(candidateList))[:]: # print candidateList[i],'eu-dist',score[i] ############# print 'tot candidate',len(totCandidate) ################### # total candidate ## idf vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=1) corpus=totCandidate corpus.append(testStr) rst=vectorizer.fit_transform(corpus) rst=rst.toarray() # distance test=rst[-1,:].reshape((1,-1))#[1,d] compare=rst[:-1,:] #[n,d] dist=calc_EuDistance(test,compare) # pick up distance<=1.2 distInd=np.where(dist<=1.2)[0]#row index dist=dist[distInd] corpus=[corpus[ij] for ij in distInd] totIndArr=np.array(totInd)[distInd] # rank=np.argsort(dist)#[:20]#index ,from smallScore->largeScore sort candidateList=[corpus[ii] for ii in rank]#list score=dist[rank] #array for i in range(len(candidateList))[:]: print strUnique(candidateList[i]),'eu-dist',score[i] ############ return totIndArr
def vectorize(msg_input):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.corpus import stopwords

    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit_transform(msg_input)
    return vectorizer.get_feature_names()
def main(): start = time.time() args = get_args() if args.class_file: wid_to_class = OrderedDict() groups = OrderedDict() for line in args.class_file: splt = line.strip().split(',') groups[splt[1]] = groups.get(splt[1], []) + [int(splt[0])] wid_to_class[int(splt[0])] = splt[1] classes = groups.keys() logger.info(u"Loading CSV...") lines = [line.decode(u'utf8').strip() for line in args.infile if line.strip()] wid_to_features = OrderedDict([(int(splt[0]), u" ".join(splt[1:])) for splt in [line.split(u',') for line in lines] if int(splt[0]) in wid_to_class ]) unknowns = OrderedDict([(int(splt[0]), u" ".join(splt[1:])) for splt in [line.split(u',') for line in lines] if int(splt[0]) not in wid_to_class ]) logger.info(u"Vectorizing...") vectorizer = TfidfVectorizer() feature_keys, feature_rows = zip(*[(classes.index(wid_to_class[int(key)]), features) for key, features in wid_to_features.items() if int(key) in wid_to_class]) vectorizer.fit_transform(feature_rows) logger.info(u"Vectorized feature rows") training_vectors = vectorizer.transform(feature_rows).toarray() logger.info(u"Vectorized training features") logger.info(u"Training %d classifiers" % len(args.classifiers)) classifiers = dict() for classifier_string in args.classifiers: clf = Classifiers.get(classifier_string) classifier_name = Classifiers.classifier_keys_to_names[classifier_string] logger.info(u"Training a %s classifier on %d instances..." % (classifier_name, len(training_vectors))) clf.fit(training_vectors, feature_keys) classifiers[classifier_string] = clf logger.info(u"Trained.") for counter, (wid, unknown) in enumerate(unknowns.items()): prediction_matrix = [classifier.predict_proba(vectorizer.transform([unknown]).toarray()) for classifier in classifiers.values()] summed_probabilities = np.sum(prediction_matrix, axis=0)[0] unknown_class = classes[list(summed_probabilities).index(max(summed_probabilities))] args.outfile.write(u"%s,%s\n" % (wid, unknown_class)) if counter % 1000 == 0: logger.info(counter) logger.info(u"Finished in %.2f seconds" % (time.time() - start))
def train_test(args): # unpack arguments and make train/test data/label dicts/lists train, test, features, classifier = args # create tf idf spare matrix from training data if features == 'tfidf': fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=1290) trainfe = fe.fit_transform(train['data']) elif features == 'dict': fe = CountVectorizer(tokenizer=tokenize, stop_words='english', binary=True) trainfe = fe.fit_transform(train['data']) elif features == 'lsa': svd = TruncatedSVD(n_components=100, random_state=42) fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.115, max_features=11500) trainfe = svd.fit_transform(fe.fit_transform(train['data'])) elif features == 'rule': hamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150) spamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150) hamfit = hamfe.fit_transform(train['data'].loc[train['labels'] == 0]) spamfit = spamfe.fit_transform(train['data'].loc[train['labels'] == 1]) # train multinomial nb classifier on training data if classifier == 'mnb': from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(trainfe, train['labels']) elif classifier == 'gnb': from sklearn.naive_bayes import GaussianNB clf = GaussianNB().fit(trainfe.toarray(), train['labels']) elif classifier == 'svm': from sklearn.linear_model import SGDClassifier clf = SGDClassifier(loss='squared_hinge', penalty='l2').fit(trainfe, train['labels']) elif classifier == 'log': from sklearn.linear_model import SGDClassifier clf = SGDClassifier(loss='log', penalty='l2').fit(trainfe, train['labels']) elif classifier == 'rule': hamfeats = hamfe.transform(test['data']) spamfeats = spamfe.transform(test['data']) hyp = np.array(hamfeats.sum(axis=1) < spamfeats.sum(axis=1)).reshape(-1).T # extract features from test data if features == 'lsa': feats = svd.transform(fe.transform(test['data'])) else: feats = fe.transform(test['data']) # use trained classifier to generate class predictions from test features if classifier == 'gnb': hyp = clf.predict(feats.toarray()) elif classifier == 'rule': pass else: hyp = clf.predict(feats) # compare predictions with test labels score = np.mean(hyp == test['labels']) return score
def vectorize_on_dict(full_paper):
    """Receives a list of articles and vectorizes on keywords from the articles."""
    articles = [' '.join(f.keywords) for f in full_paper]
    # articles = list(itertools.chain(*articles))
    vectorizer = TfidfVectorizer()
    # fit once and reuse the matrix instead of calling fit_transform twice
    matrix = vectorizer.fit_transform(articles)
    return vectorizer, matrix
class Corpus(object): def buildCorpus(self, region, time_interval, element_type='photos', paras={}): # time_interval should be [start, end] text = [] if element_type == 'photos': ei = PhotoInterface() cur = ei.rangeQuery(region, time_interval, 'caption.text') else: ei = TweetInterface() cur = ei.rangeQuery(region, time_interval, 'text') for t in cur: try: if element_type == 'photos': text.append(t['caption']['text']) else: text.append(t['text']) except: pass # it is not proper here to set up stopwords self._vectorizer = TfidfVectorizer(max_df=paras.get('max_df', 0.2), min_df=paras.get('min_df', 0.0), strip_accents=paras.get('strip_accents', 'ascii'), preprocessor=paras.get('preprocessor', tool.textPreprocessor), smooth_idf=paras.get('smooth_idf', True), sublinear_tf=paras.get('sublinear_tf', True), norm=paras.get('norm', 'l2'), analyzer=paras.get('analyzer', 'word'), ngram_range=paras.get('ngram_range', (1, 1)), stop_words=paras.get('stop_words', 'english') ) # If the program do not break here, we may ignore the bug try: self._vectorizer.fit_transform(text) except Exception as error : logging.warn(error) def getVectorizer(self): return self._vectorizer def chooseTopWordWithHighestTDIDF(self, text, k=10): voc = self._vectorizer.get_feature_names() tf_vec = self._vectorizer.transform([text]).mean(axis=0) nonzeros = np.nonzero(tf_vec)[1] res_list = nonzeros.ravel().tolist()[0] values = [] words = [] for n in res_list: words.append(voc[n]) values.append(tf_vec[0, n]) while len(values) < k: values.append(0) #return res_list, words, values return values
def get_feature_cosine_similarity(train):
    feature_prod_title = []
    feature_prod_desc = []
    # ensure the size is as required
    vect = TfidfVectorizer(min_df=1)
    for _, row in train.iterrows():
        # a fresh two-document tf-idf space per row: [title, query] and [description, query]
        cos_prod_title = vect.fit_transform([row["product_title"], row["search_term"]])
        cos_prod_desc = vect.fit_transform([row["product_description"], row["search_term"]])
        # the off-diagonal entry of X * X.T is the cosine similarity of the two l2-normalised rows
        feature_prod_title.append((cos_prod_title * cos_prod_title.T).A[0][1])
        feature_prod_desc.append((cos_prod_desc * cos_prod_desc.T).A[0][1])
    return feature_prod_title, feature_prod_desc
def init_tfidf(self):
    train_data = pd.read_csv('%s/train.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]
    test_data = pd.read_csv('%s/test.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]

    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    tfidf_txt = pd.Series(
        train_data['question1'].tolist() + train_data['question2'].tolist() +
        test_data['question1'].tolist() + test_data['question2'].tolist()).astype(str)
    tfidf.fit_transform(tfidf_txt)

    LogUtil.log("INFO", "init tfidf done")
    return tfidf
def get_X_train(data, wn=False, ignore=False, max_n_gram=1, lowercase=True, nopunc=False, lemmatize=False, stem=False, remove_stop_words=True, tfidf=False, verbose=True): if verbose: print('Using n-grams of up to %d words in length' % max_n_gram) if lowercase and verbose: print('Converting all text to lowercase') if lemmatize: tokenizer = LemmaTokenizer(nopunc) if verbose: print('Lemmatizing all words') elif stem: tokenizer = StemTokenizer(nopunc) if verbose: print('Stemming all words') else: tokenizer = None if remove_stop_words: stop_words = 'english' if verbose: print('Removing English stop words') else: stop_words = None t0 = time() if tfidf: if verbose: print() print('Extracting features from the test data using a tfidf vectorizer') vectorizer = TfidfVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram)) X_train = vectorizer.fit_transform(data) else: if verbose: print('Extracting features from the test data using a count vectorizer') vectorizer = CountVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram)) if wn: print('Learning a vocabulary dictionary with a count vectorizer') vectorizer.fit(data) print('Done learning vocabulary dictionary') vectorizer = WordNetVectorizer(vectorizer) print('Getting wordnet based feature vectors...') X_train = vectorizer.get_word_net_feature_vecs(data, ignore) print('Done getting wordnet based feature vectors') else: X_train = vectorizer.fit_transform(data) duration = time() - t0 if verbose: data_train_size_mb = size_mb(data) print('done in %fs at %0.3fMB/s' % (duration, data_train_size_mb / duration)) print('n_samples: %d, n_features: %d' % X_train.shape) print() return X_train, vectorizer
class Cluster:

    def __init__(self):
        self.train_file = os.path.join('data', 'sample')

    def run_main(self):
        self.load_data()
        self.vectorize()

        # KMeans - K++
        print "KMeans - K++"
        self.kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10000)
        self.train()
        self.get_metrics()

        # MiniBatchKMeans - K++
        print "MiniBatchKMeans - K++"
        self.kmeans = MiniBatchKMeans(n_clusters=3, init='k-means++', n_init=10000)
        self.train()
        self.get_metrics()

        # KMeans - Random
        print "KMeans - Random"
        self.kmeans = KMeans(n_clusters=3, init='random', n_init=10000)
        self.train()
        self.get_metrics()

        # MiniBatchKMeans - Random
        print "MiniBatchKMeans - Random"
        self.kmeans = MiniBatchKMeans(n_clusters=3, init='random', n_init=10000)
        self.train()
        self.get_metrics()

    def load_data(self):
        self.training_data = []
        with open(self.train_file, 'r') as fd:
            for line in fd.readlines():
                self.training_data.append(line)

    def vectorize(self):
        self.vect = TfidfVectorizer(stop_words='english')
        self.X = self.vect.fit_transform(self.training_data)

    def train(self):
        self.kmeans.fit(self.X)

    def get_metrics(self):
        print self.kmeans.labels_

    def test(self):
        self.test_data = ["I know both Ashok and Harini"]
        # transform with the fitted vectorizer; refitting on the test sentence
        # would give a different feature space than the one KMeans was trained on
        self.Y = self.vect.transform(self.test_data)
        print self.kmeans.predict(self.Y)
def tf_idf_threading(table, dates):
    for date in dates:
        corpus = reviews[date]
        if len(corpus) == 1 and len(corpus[0]) == 1:
            continue
        print("TF-IDF processing " + date)
        vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
        vectorizer.fit_transform(corpus)
        idf = vectorizer._tfidf.idf_
        table[date] = dict(zip(vectorizer.get_feature_names(), idf))
    return
vectorizer = CountVectorizer(max_features=2000) ingredients = train['ingredients'] words_list = [' '.join(x) for x in ingredients] #Make label encoder le = preprocessing.LabelEncoder() le.fit(train["cuisine"]) #create a bag of words and convert to a array and then print the shape bag_of_words = vectorizer.fit(words_list) bag_of_words = vectorizer.transform(words_list).toarray() print(bag_of_words.shape) vectorizertfidf = TfidfVectorizer(min_df=1) tfidf = vectorizertfidf.fit_transform(words_list).toarray() print tfidf.shape X = bag_of_words y = le.transform(train["cuisine"]) print X.shape print y.shape dtrain = xgb.DMatrix(X, label=y) n_classes = len(list(set(y))) param = { 'max_depth': 14, 'eta': 1,
"""# Content Based ## TF-IDF """ movies.genres = movies.genres.str.split('|') movies.head() movies.genres = movies.genres.fillna("").astype('str') movies.head() from sklearn.feature_extraction.text import TfidfVectorizer tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english') tfidf_matrix = tf.fit_transform(movies['genres']) tfidf_matrix.shape tfidf_matrix print(tf.get_feature_names()) """## Cosine Similarity""" from sklearn.metrics.pairwise import cosine_similarity sim = cosine_similarity(tfidf_matrix) sim.shape sim[:4, :4]
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.svm import SVC as svc from sklearn.metrics import make_scorer, roc_auc_score from scipy import stats from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.model_selection import train_test_split # DATA PREPARATION data = pd.read_csv('/home/sergi/CityRoad_Disruptions/DataSet/preprocessed.csv', sep='\t', lineterminator='\n') text = data['Text'] target = data['Class'] tf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2)) train_matrix = tf_vectorizer.fit_transform(text.values.astype('U')).toarray() X_train, X_test, y_train, y_test = train_test_split(train_matrix, target, test_size=.3) # DEFINE MODEL AND PERFORMANCE MEASURE mdl = svc(probability = True, random_state = 1) auc = make_scorer(roc_auc_score) # GRID SEARCH FOR 20 COMBINATIONS OF PARAMETERS grid_list = {"C": np.arange(2, 10, 2), "gamma": np.arange(0.1, 1, 0.2)} grid_search = GridSearchCV(mdl, param_grid = grid_list, n_jobs = 4, cv = 3, scoring = auc) grid_search.fit(X_train, y_train) grid_search.cv_results_ def report(results, n_top=3):
def tfidf_vec(corpus):
    tfidf = TfidfVectorizer()
    train_vec = tfidf.fit_transform(corpus)
    # for test data, reuse the fitted vectorizer:
    # tfidf.transform(['ya Allah meri sister Affia ki madad farma', 'khud chahta a is umar main shadi'])
    return train_vec, tfidf
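# A minimal usage sketch for the helper above; the example strings are assumed,
# not taken from the original corpus.
train_corpus = ["first training document", "second training document"]
train_vec, tfidf = tfidf_vec(train_corpus)
# transform (do not refit) held-out text so it shares the training vocabulary
test_vec = tfidf.transform(["an unseen test document"])
print(train_vec.shape, test_vec.shape)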
for row in dataset.itertuples(): # make sure its a string if isinstance(row.SYMPTOM_TEXT, str): curr_words = [] tokenized = nltk.tokenize.word_tokenize(row.SYMPTOM_TEXT) for word in tokenized: pre = preprocess(word) if pre != '': pre = nltk.PorterStemmer().stem(pre) curr_words.append(pre) sentences.append(' '.join(curr_words)) vect = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7) tfidf_matrix = vect.fit_transform(sentences).toarray() feature_names = vect.get_feature_names() dataset['SERIOUS'][100:1000] = 'Y' X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, dataset['SERIOUS'], test_size=0.2, random_state=0) classifier = RandomForestClassifier(n_estimators=1000, random_state=0) classifier.fit(X_train, y_train) y_pred = classifier.predict(X_test) print(confusion_matrix(y_test, y_pred))
# In[24]:

df.columns

# # Step 2: TF-IDF factorization of the text column

# In[25]:

from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(sublinear_tf=True, max_df=0.3, min_df=0.10, max_features=500,
                    analyzer='word', stop_words='english', ngram_range=(1, 3),
                    use_idf=True)
x = v.fit_transform(df['text'])

# In[26]:

x

# In[27]:

v

# In[28]:
bagOfWordsA = x.split() for word in bagOfWordsA: numOfWordsA[word] += 1 return numOfWordsA # array_negative=count_vectorizer.fit_transform(data_negative.splitlines()) # array_positive=count_vectorizer.fit_transform(data_positive.splitlines()) # tfidf_negative = tfidf_vector.fit_transform(data_negative.splitlines()) # tfidf_positive = tfidf_vector.fit_transform(data_positive.splitlines()) # print(tfidf_negative) data = open("test.txt").read().splitlines() tf_idf = tfidf_vector.fit_transform(data) print(tfidf_vector.get_feature_names()) array_train = [] list = [] for i in data_negative.splitlines(): array_train.append(0) for i in data_positive.splitlines(): array_train.append(1) print(tf_idf.toarray()) # X = count_vectorizer.fit_transform(data) # tfidf_vector # print(count_vectorizer.vocabulary_) # array=X.toarray() # print(array)
columns=['category', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'], ascending=False))

# ### TF-IDF Model

# +
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., norm="l2", use_idf=True, smooth_idf=True)
tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

print('TF-IDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)
# -

tv_matrix = tv_train_features.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

# ### ML algorithms on the TF-IDF model

import time
import warnings
warnings.filterwarnings('ignore')
data.append(twt) labels.append(c) print feature, twt # break L = len(full_data) random.shuffle(full_data) train_data = [i[1] for i in full_data[:int(0.8 * L)]] train_features = [i[0] for i in full_data[:int(0.8 * L)]] train_labels = [i[2] for i in full_data[:int(0.8 * L)]] test_data = [i[1] for i in full_data[int(0.8 * L):]] test_features = [i[0] for i in full_data[int(0.8 * L):]] test_labels = [i[2] for i in full_data[int(0.8 * L):]] vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True, decode_error='ignore') train_vectors = vectorizer.fit_transform(train_data) test_vectors = vectorizer.transform(test_data) final_train = np.hstack([train_features, train_vectors.toarray()]) final_test = np.hstack([test_features, test_vectors.toarray()]) print final_train # classifier_rbf = svm.SVC(kernel='rbf') # classifier_rbf.fit(final_train,train_labels) # prediction_rbf=classifier_rbf.predict(final_test) # print(classification_report(test_labels, prediction_rbf)) # print(accuracy_score(test_labels, prediction_rbf))
from nlpia.data.loaders import harry_docs as docs
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = docs
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(corpus)
print(model.todense().round(2))
def train_sklearn(): good_heads = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16" ] bad = pandas.read_csv("bad.csv", delimiter=";")['address'].sample(frac=1) good = pandas.read_csv("good.csv", delimiter=";", names=good_heads)["2"].sample(frac=1) train_good = good[:130000] train_bad = bad[:130000] test_good = good[130000:] test_bad = bad[130000:] train_data = [] for i in train_good: train_data.append([i, 1]) for i in train_bad: train_data.append([i, 0]) test_data = [] for i in test_good: test_data.append([i, 1]) for i in test_bad: test_data.append([i, 0]) np.random.shuffle(train_data) np.random.shuffle(test_data) train_x = [] train_y = [] for i in train_data: train_x.append(i[0]) train_y.append(i[1]) test_x = [] test_y = [] for i in test_data: test_x.append(i[0]) test_y.append(i[1]) vectorizer = TfidfVectorizer(min_df=5) train_x = vectorizer.fit_transform(train_x) test_x = vectorizer.transform(test_x) model = LogisticRegression(random_state=42) # model = GradientBoostingClassifier(n_estimators=250, random_state=42, verbose=1, max_features='sqrt') # model = RandomForestClassifier(n_estimators=10, verbose=1, random_state=241, n_jobs=-1, max_features='sqrt') model.fit(train_x, train_y) # scores_train = list(map(lambda i: roc_auc_score(train_y, i[:, 1]), list(model.staged_predict_proba(train_x)))) # scores_test = list(map(lambda i: roc_auc_score(test_y, i[:, 1]), list(model.staged_predict_proba(test_x)))) # scores_train = list(model.staged_predict_proba(train_x)) # scores_test = list(model.staged_predict_proba(test_x)) # plt.figure() # plt.plot(scores_train, 'r', linewidth=2) # plt.plot(scores_test, 'g', linewidth=2) # plt.legend(['test', 'train']) # plt.show() # score = roc_auc_score(test_y, model.predict_proba(test_x)[:, 1]) pred = model.predict(test_x) score_f1 = f1_score(test_y, pred) score_recall = recall_score(test_y, pred) score_accuracy = accuracy_score(test_y, pred) print("f1: ", score_f1) print("recall: ", score_recall) print("accuracy: ", score_accuracy) if not os.path.exists('models'): os.makedirs('models') dump(model, 'models/lr1.model') dump(vectorizer, 'models/vectorizer1.model')
authors = pickle.load(open(authors_file, "r"))

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
# from sklearn import cross_validation
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
with open('datasets\\dataset_10.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    documents = [""]
    comptageLigne = 0
    for row in csv_reader:
        documents = numpy.append(documents, [row[3]])
        comptageLigne += 1
        documents = numpy.append(documents, [row[3]])
        comptageLigne += 1
    csv_file.close()

# Initialize the list of words to ignore when searching the file
listeCompleteMotsBloques = stopwords.words('english') + stopwords.words('spanish')

vectoriseur = TfidfVectorizer(stop_words=listeCompleteMotsBloques)
X = vectoriseur.fit_transform(documents)

# Initialize the cluster search
nombreClusters = 100
modele = KMeans(n_clusters=nombreClusters, init='k-means++', max_iter=6000, n_init=1)
modele.fit(X)

ordreCentroides = modele.cluster_centers_.argsort()[:, ::-1]
termes = vectoriseur.get_feature_names()

# Detect the clusters and save them to the motsClusters.csv file
with open('results\\motsClusters.csv', mode='w') as clusters_file:
    motsClusters = csv.writer(clusters_file,
config = DBConfig(working_dir + "/db.ini").read_db_config() # Open database connection db = MySQLdb.connect(**config) data = sql.read_sql(queryAnswers % question_id, db) cursor = db.cursor() vectorizer = TfidfVectorizer(tokenizer=process_text, stop_words=stopwords, max_df=max_df, min_df=min_df, use_idf=True, lowercase=True) docs = data['Value'].tolist() ids = data['ID'].tolist() tfidf_model = vectorizer.fit_transform(docs) km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100000, n_init=1) km.fit(tfidf_model) clusters = km.labels_.tolist() # create main data frame frame = pd.DataFrame({ 'ids': ids, 'answers': docs, 'cluster': clusters },
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

s1 = 'The field of study that focuses on the interactions between human language and computers is called Natural Language Processing, or NLP for short'
s2 = 'Natural Language Processing is a field that covers computer understanding and manipulation of human language, and its ripe with possibilities for new gathering'
s3 = 'NLP is a way for computers to analyze, understand, and derive meaning from human language in a smart and useful way'
s4 = 'NLP is characterized as a hard problem in computer science.'
s5 = 'NLP algorithms are typically based on machine learning algorithms. Instead of hand-coding large sets of rules'
query = 'NLP sits at the intersection of computer science, artificial intelligence, and computational linguistics'

tfidf = TfidfVectorizer()
dataset = [query, s1, s2, s3, s4, s5]
matrix = tfidf.fit_transform(dataset)

dic = tfidf.vocabulary_
for key in dic.keys():
    print('{0} {1}'.format(key, dic[key]))

print(cosine_similarity(matrix[0:1], matrix))
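# A short follow-up sketch: rank the candidate sentences by cosine similarity
# to the query. It assumes the `matrix` and `dataset` variables built just above.
import numpy as np

sims = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
best_idx = int(np.argmax(sims))
print('most similar sentence:', dataset[1:][best_idx])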
data.columns = ['labels', 'texts']

# Explore the dataset
print('Out of {} rows, {} are spam, {} are ham'.format(
    len(data), len(data[data['labels'] == 'spam']), len(data[data['labels'] == 'ham'])))

# Check the number of missing data
print('Number of null in labels: {} and number of null in texts: {}'.format(
    data['labels'].isnull().sum(), data['texts'].isnull().sum()))

# stopwords removal
stopwords = nltk.corpus.stopwords.words('english')

# WordNet lemmatizer
wm = nltk.WordNetLemmatizer()

# pre-processing data
def data_clean(texts):
    text = "".join([char for char in texts if char not in string.punctuation])
    tokens = re.split(r'\W+', text)  # split on non-word characters
    text = [wm.lemmatize(word) for word in tokens if word not in stopwords]
    return text

data['cleaned_text'] = data['texts'].apply(lambda x: data_clean(x.lower()))

# Vectorizing
tfidf_vect = TfidfVectorizer(analyzer=data_clean)
X_tfidf = tfidf_vect.fit_transform(data['cleaned_text'])

import ipdb; ipdb.set_trace()
print(X_tfidf.shape, tfidf_vect.get_feature_names())
#%% # Bag of word #%% corpus = [] train_corpus = [] test_corpus = [] for text in train: corpus.append(" ".join(text)) for text in xtrain: train_corpus.append(" ".join(text)) for text in xtest: test_corpus.append(" ".join(text)) #%% from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer() trainData = vectorizer.fit_transform(corpus) xtrain1 = vectorizer.transform(train_corpus) xtest1 = vectorizer.transform(test_corpus) xtrain1 = xtrain1.toarray() xtest1 = xtest1.toarray() #%% from sklearn import linear_model log = linear_model.LogisticRegression(C=1).fit(xtrain1, ytrain) #%% from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score y_log1 = log.predict(xtest1) y_log2 = log.predict(xtrain1) print("Accuracy score is: ", accuracy_score(y_log1, ytest)) print("F1 score is: ", f1_score(y_log1, ytest, average="macro")) print("precision score is: ", precision_score(y_log1, ytest, average="macro")) print("recall score is: ", recall_score(y_log1, ytest, average="macro"))
if 'MLCOMP_DATASETS_HOME' not in os.environ: print "MLCOMP_DATASETS_HOME not set; please follow the above instructions" sys.exit(0) # Load the training set print "Loading 20 newsgroups training set... " news_train = load_mlcomp('20news-18828', 'train') print news_train.DESCR print "%d documents" % len(news_train.filenames) print "%d categories" % len(news_train.target_names) print "Extracting features from the dataset using a sparse vectorizer" t0 = time() vectorizer = TfidfVectorizer(charset='latin1') X_train = vectorizer.fit_transform( (open(f).read() for f in news_train.filenames)) print "done in %fs" % (time() - t0) print "n_samples: %d, n_features: %d" % X_train.shape assert sp.issparse(X_train) y_train = news_train.target print "Loading 20 newsgroups test set... " news_test = load_mlcomp('20news-18828', 'test') t0 = time() print "done in %fs" % (time() - t0) print "Predicting the labels of the test set..." print "%d documents" % len(news_test.filenames) print "%d categories" % len(news_test.target_names) print "Extracting features from the dataset using the same vectorizer"
def get_vocabulary(self, linked_pages, categories_links): """Scrapp a wiki page to get vocabulary for each category""" total_vocabulary = {} unique_vocabulary = [] unique_vocabulary_tfidf = [] # For each category for parent, pages in linked_pages.items(): children_pages = [] downloaded_pages = [] # For every pages linked to this wategory on Wiki for page in pages: sys.stdout.write('\t{} / {} pages downloaded for [{}] category.\r'.format(len(children_pages)-1, len(pages), parent)) sys.stdout.flush() # Get data wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(page) data = requests.get(wiki_url) data_soup = BeautifulSoup(data.text, 'html.parser') paragraphs = [str(paragraph) for paragraph in data_soup.find_all('p')] paragraphs_joined = ' '.join(paragraphs) # Clean, tokenize, stemm and rebuild the document page_vocabulary = [] cleaned_data = self.clean_xml(text=paragraphs_joined.strip()) tokenized_data = self.tokenizer.tokenize(cleaned_data) for token in tokenized_data: if token.lower() not in self.stopwords: word = self.lemmatizer.lemmatize(token.lower()) # Check if the word is correct if self.english_dict.check(word) is True: page_vocabulary.append(word) # Track total vocabulary if word not in unique_vocabulary: unique_vocabulary.append(word) # Here, why not Levenstein for correction, but gonna be long page_nlp_treated = ' '.join(page_vocabulary) if len(children_pages) >= self.configuration['options']['pages_per_category'] or len(children_pages) == len(pages): break else: children_pages.append(page_nlp_treated) downloaded_pages.append(page) # Wikipedia is cool, be cool with their servers. time.sleep(self.configuration['options']['waiting_time']) # StdOut summary print('\n\t\t- ' + '\n\t\t- '.join(downloaded_pages)) # TF_IDF for vocabulary of each category and get top score tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0, stop_words=self.stopwords) try: tfidf_matrix = tf.fit_transform(children_pages) except ValueError: # In case of an old empty page continue feature_names = tf.get_feature_names() dense = tfidf_matrix.todense() episode = dense[0].tolist()[0] phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0] sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) category_words = [] for word, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:self.configuration['options']['word_per_page']]: category_words.append({word: score}) if word not in unique_vocabulary_tfidf: unique_vocabulary_tfidf.append(word) # Get linked categories to category linked_categories = [] for relation in relations: if relation[0] == parent and relation[0] not in linked_categories: linked_categories.append(relation[1]) if relation[1] == parent and relation[1] not in linked_categories: linked_categories.append(relation[0]) # Get linked pages to category for category, pages in linked_pages.items(): if category == parent: linked_pages_to_category = pages category_details = {} category_details['terminology'] = category_words category_details['linked_pages_to_category'] = linked_pages_to_category category_details['linked_categories'] = linked_categories total_vocabulary[parent] = category_details # Statistics about our terminology print('\nA total of {} words have been scanned to extract {} important words covering {} categories.'.format(len(unique_vocabulary), len(unique_vocabulary_tfidf), len(linked_pages))) return total_vocabulary
# only one document or in at least 95% of the documents are removed.
print("Loading dataset...")
t0 = time()
data_samples = pd.read_csv('data/raw_data/dataset.csv', sep=';',
                           index_col=0)['Required skill'].tolist()
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
doc = normalize_corpus_words([text.lower()], synonyms=synonyms, stopwords=stopwords)[0] stems = [w for w in doc.split() if w in vocabulary] return stems fun_words = vocabulary = 'cat dog apple lion nyc love big small' fun_stems = normalize_corpus_words([fun_words])[0].split()[:NUM_WORDS] fun_words = fun_words.split() if SAVE_SORTED_CORPUS: tfidfer = TfidfVectorizer(min_df=2, max_df=.6, stop_words=None, token_pattern=r'(?u)\b\w+\b') corpus = get_data('cats_and_dogs')[:NUM_DOCS] docs = normalize_corpus_words(corpus, stemmer=None) tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense()) id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()] tfidf_dense.columns = list(zip(*sorted(id_words)))[1] word_tfidf_dense = pd.DataFrame(tfidfer.transform(fun_stems).todense()) word_tfidf_dense.columns = list(zip(*sorted(id_words)))[1] word_tfidf_dense.index = fun_stems """ >>> word_tfidf_dense[fun_stems] cat dog anim pet citi appl nyc car bike hat cat 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 dog 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 anim 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 pet 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 citi 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
print("%d documents" % len(dataset.data)) print("%d categories" % len(dataset.target_names)) print() labels_ture = dataset.target true_k = np.unique(labels_ture).shape[0] print("Extracting features from the training dataset " "using a sparse vectorizer") #矩阵和权值 vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2, stop_words='english', use_idf=True) matrix = vectorizer.fit_transform(dataset.data) print("n_samples: %d, n_features: %d" % matrix.shape) print() #降维 print("Performing dimensionality reduction using LSA") t0 = time() svd = TruncatedSVD(2) #维度 normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) matrix_l = lsa.fit_transform(matrix) # 2D embedding of the digits dataset
def main():
    args = sys.argv
    # param = Params(args[0])
    # csv_file = "/home/thiagodepaulo/exp/text-collections/Sequence_of_words_CSV/CSTR.csv"
    # n_pos = 5
    # k = 4
    # local_itr = 10
    # global_itr = 10
    # alpha = 0.05
    # beta = 0.0001
    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    local_itr = float(args[4])
    global_itr = float(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    target_name = list(set(y))
    n_class = len(target_name)

    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # randomly select one class and n_pos labeled examples from it
    choosed_cls = target_name[randint(0, n_class - 1)]
    selected_idx = np.random.choice(np.where(y == choosed_cls)[0], size=n_pos, replace=False)

    # mark everything else with -1 (unlabeled)
    y_train = np.copy(y)
    y_train[[i for i in range(len(y)) if i not in selected_idx]] = -1
    X_test, y_test = remove_rows(X, y, selected_idx)

    def eval_func(model):
        y_predict = model.predict(X_test)
        y_predict = [1 if c == choosed_cls else 0 for c in y_predict]
        y_test2 = [1 if c == choosed_cls else 0 for c in y_test]
        # compute the metric
        labels = [0, 1]
        names = ["others", choosed_cls]
        report = classification_report(y_test2, y_predict, labels=labels, target_names=names)
        print('\n' + report + '\n')

    # attach the evaluation function
    model.eval_func = eval_func

    # train the model
    model.fit(X, y_train)
def remote_css(url): st.markdown(f'<link href="{url}" rel="stylesheet">', unsafe_allow_html=True) local_css("style.css") remote_css('https://fonts.googleapis.com/icon?family=Material+Icons') # search_input = st.text_input("Enter keyword/s", "") button_clicked = st.button("Go") df['Title'] = df['Title'].astype(str) df['Keywords'] = df['Keywords'].astype(str) tfidf = TfidfVectorizer() tfidf_features = tfidf.fit_transform(df.Title) df = df.astype({'Dominant_Topic': int}) df_topics = df.groupby(['Dominant_Topic', 'Keywords']).size().to_frame().reset_index() topics = df_topics[['Dominant_Topic', 'Keywords']] topics_dict = topics.set_index('Dominant_Topic').T.to_dict('list') keys_values = topics_dict.items() new_dict = {int(key): str(value) for key, value in keys_values} labels_map = new_dict X = df['Title'].astype(str) y = df['Dominant_Topic'] random.seed(42) X_train, X_test, y_train, y_test = train_test_split(X, y,
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

captions = []
caption_file = open("cap.txt", encoding="utf8")
for caption in caption_file:
    captions.append(caption.split(' ', 1)[1])

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(captions)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print("\n")

print("Prediction")
Y = vectorizer.transform(["FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148."])
prediction = model.predict(Y)
print("FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148.")
print(prediction)
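adjusted_rand_score is imported above but never used. A hedged sketch (hypothetical labels, not from cap.txt) of how it could be applied if ground-truth cluster assignments for the captions were available, e.g. against model.labels_ from the fitted KMeans:

from sklearn.metrics import adjusted_rand_score

# hypothetical ground-truth labels for six captions vs. predicted cluster ids
true_labels = [0, 0, 1, 1, 2, 2]
pred_labels = [1, 1, 0, 0, 2, 2]
print(adjusted_rand_score(true_labels, pred_labels))   # 1.0 -- label permutations do not matter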
        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]

        ### append the text to word_data
        word_data.append(text)

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
        if name == "sara":
            from_data.append(0)
        else:
            from_data.append(1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )

print word_data[152]

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")
transformed_word_data = vectorizer.fit_transform(word_data)

print "count of words: ", len(vectorizer.get_feature_names())
print "word 34597: ", vectorizer.get_feature_names()[34597]
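The str.replace() step at the top of this block is left as an exercise and not implemented. A hedged sketch (illustrative text, not the course's reference solution) of how that removal might look:

text = "hi sara this is chris about the shackleton germani deal"
for word in ["sara", "shackleton", "chris", "germani"]:
    text = text.replace(word, "")
print(" ".join(text.split()))   # "hi this is about the deal"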
def main():
    process_resume_list()

save_model = 'finalized_model.sav'
save_vector = 'finalized_vectorizer.sav'

if __name__ == '__main__':
    main()

label = np.array(labelList)
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', max_features=250)
resumes_train, resumes_test, y_train, y_test = train_test_split(resume_list, label, test_size=0.33, random_state=1)
X_train = vectorizer.fit_transform(resumes_train)
X_test = vectorizer.transform(resumes_test)   # transform (not fit_transform) so the test split shares the training vocabulary
X_train_array = X_train.toarray()
X_test_array = X_test.toarray()
y_test1 = y_test.reshape(-1, 1)
print(vectorizer.get_feature_names())
pickle.dump(vectorizer, open(save_vector, 'wb'))

# Implementing Bernoulli Naive Bayes
naive_bayes = BernoulliNB(alpha=1.0)
naive_bayes.fit(X_train_array, y_train)
predictions = naive_bayes.predict(X_test_array)
naivescore = (naive_bayes.score(X_test_array, y_test1)) * 100
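The original called fit_transform on both splits, which gives train and test matrices with different vocabularies and column counts. A hedged, standalone sketch (illustrative documents and names) of the usual pattern: fit on the training split only, then transform the test split with the same fitted vectorizer.

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["python developer with nlp experience",
              "java backend engineer",
              "data scientist with python and sql"]
test_docs = ["nlp engineer with python"]

vec = TfidfVectorizer(stop_words='english')
X_train = vec.fit_transform(train_docs)   # learn the vocabulary on the training split only
X_test = vec.transform(test_docs)         # reuse it; no refitting on the test split

print(X_train.shape[1] == X_test.shape[1])   # True: identical feature spaces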
def getFeature(foldername):
    filenamelist = []
    # foldername = 'ratings2020'
    for subdir, dirs, files in os.walk(foldername):
        for file in os.listdir(subdir):
            filepath = subdir + os.sep + file
            filepath = re.sub(r"\\", "/", filepath)   # normalise Windows path separators
            if ".csv" in filepath:
                filenamelist.append(filepath)

    # ----------------> merge all the CSV files into one dataframe
    df_merged = (pd.read_csv(filepath_or_buffer=file, sep=',', encoding='utf-16',
                             error_bad_lines=False, engine='python')
                 for file in filenamelist)
    df_merged = pd.concat(df_merged, ignore_index=True)
    df_merged.to_csv("merged.csv")
    df_merged.columns = [column.replace(" ", "_") for column in df_merged.columns]

    df = df_merged[["Star_Rating", "Reviewer_Language", "Review_Text", "App_Version_Code"]]
    pd.set_option('mode.chained_assignment', None)  # suppress the SettingWithCopyWarning
    df['Positively_Rated'] = np.where(df['Star_Rating'] >= 3, 1, 0)

    # @@@@@@@@@@@@@@@@@@@ UI FEATURE 1 @@@@@@@@@@@@@@@@@@@@@@@@@@
    total_rating = len(df['Star_Rating'])
    pd.set_option('mode.chained_assignment', None)
    df.dropna(inplace=True, how='any')
    total_reviews = len(df['Review_Text'])

    # In version 1.0 we only check English reviews
    df = df[df.Reviewer_Language == 'en']

    # positive/negative count and proportion for the latest version
    latest_version = max(df["App_Version_Code"])
    VrsnRating = df[df.App_Version_Code == latest_version].Positively_Rated.mean()
    VrsnRating = round(VrsnRating * 100, 2)

    ########## DATA CLEANING ##########
    df['Review'] = df['Review_Text'].apply(lambda x: x.lower())
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\W", " ", x))               # non-word characters
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\d", " ", x))               # digits
    df['Review'] = df['Review'].apply(lambda x: re.sub("([^\x00-\x7F])+", " ", x))   # emojis / non-ASCII
    df['Review'] = df['Review'].apply(lambda x: re.sub(r' \w{1,4} ', ' ', x))        # words of 1-4 characters
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\s+", " ", x))              # collapse whitespace
    df['Review'] = lemma(df['Review'])
    df['Review'] = df['Review'].apply(stp)

    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(inplace=True)
    df.isnull()
    df['Review'] = tagme(df['Review'])

    sid = SentimentIntensityAnalyzer()
    df["sentiments"] = df["Review_Text"].apply(lambda x: sid.polarity_scores(x))  # {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': ...}
    df = pd.concat([df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)], axis=1)

    # add number of characters column
    df["nb_chars"] = df["Review_Text"].apply(lambda x: len(x))

    # add number of words column
    df["nb_words"] = df["Review_Text"].apply(lambda x: len(x.split(" ")))

    documents = [TaggedDocument(doc, [i])
                 for i, doc in enumerate(df["Review"].apply(lambda x: str(x).split(" ")))]

    # train a Doc2Vec model with our text data
    model = Doc2Vec(documents, vector_size=30, window=2, min_count=1, workers=4)

    # transform each document into a vector
    doc2vec_df = df["Review"].apply(lambda x: model.infer_vector(str(x).split(" "))).apply(pd.Series)
    doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
    df = pd.concat([df, doc2vec_df], axis=1)

    corpus = []
    for sentences in df["Review"]:
        corpus.append([word for word, tag in sentences])
    df['cln_Reviews'] = [" ".join(review) for review in corpus]

    # add tf-idf columns (ignore terms appearing in fewer than 5 documents)
    tfidf = TfidfVectorizer(min_df=5)
    tfidf_result = tfidf.fit_transform(df["cln_Reviews"]).toarray()
    tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
    tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
    tfidf_df.index = df.index
    reviews_df = pd.concat([df, tfidf_df], axis=1)

    wrdcldimg = show_wordcloud_fn(corpus)

    best_negsentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "neg", ascending=False)[["Review_Text"]].head()
    # best_negsentences = reviews_df.sort_values("neg", ascending=False)[["Review_Text"]].head()
    best_negsentences = best_negsentences.to_string(index=False)

    pos_best_sentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "pos", ascending=False)[["Review_Text"]].head()
    # pos_best_sentences = reviews_df.sort_values("pos", ascending=False)[["Review_Text"]].head()
    pos_best_sentences = pos_best_sentences.to_string(index=False)

    # apprtngimg = appvsrating(reviews_df)
    return (best_negsentences, pos_best_sentences, total_rating, total_reviews,
            VrsnRating, latest_version, wrdcldimg)
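A tiny standalone sketch (made-up rows, illustrative names) of the dict-expansion pattern used at the top of this block: apply(pd.Series) turns the per-row sentiment dictionaries into one numeric column per key.

import pandas as pd

df_demo = pd.DataFrame({
    "Review_Text": ["great app", "terrible update"],
    "sentiments": [{"neg": 0.0, "pos": 0.8, "compound": 0.6},
                   {"neg": 0.7, "pos": 0.0, "compound": -0.5}],
})

# expand the dict column into one column per key, as done above
df_demo = pd.concat([df_demo.drop(columns=["sentiments"]),
                     df_demo["sentiments"].apply(pd.Series)], axis=1)
print(df_demo.columns.tolist())   # ['Review_Text', 'neg', 'pos', 'compound']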
v = TfidfVectorizer(stop_words='english', analyzer="word", use_idf=True,
                    min_df=1, smooth_idf=True, norm=None)   # norm=None disables normalisation

base = pd.read_csv("films.csv")

# add new, empty tf-idf columns
base['tfidf1'] = 0
base['tfidf2'] = 0
base['tfidf3'] = 0
base['tfidf4'] = 0

# compute the idf values
x = v.fit_transform(base.loc[:, 'storyline'].values.astype('U'))
idf = v.idf_

# build a dictionary of the form token -> idf value
dictineri = dict(zip(v.get_feature_names(), idf))

for i, row in base.iterrows():
    accStoryline = list(map(lambda x: x.lower(), row['storyline'].split()))
    trol = dict()
    # accumulate the tf-idf values for this storyline's tokens into trol
    for accWord in accStoryline:
        foo = accWord.replace('.', '')
        if foo in dictineri:
            if foo in trol:
                trol[foo] += dictineri[foo]
            else:
                trol[foo] = dictineri[foo]
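A hedged, standalone alternative sketch (toy storylines, illustrative names): the same "top tf-idf terms per film" can be read directly off the fitted matrix instead of re-accumulating idf values per token by hand.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

storylines = ["a detective hunts a killer in the city",
              "two robots fall in love in space",
              "a killer robot hunts a detective"]

vec = TfidfVectorizer(stop_words='english')
matrix = vec.fit_transform(storylines)
terms = np.array(vec.get_feature_names_out())

for i in range(matrix.shape[0]):
    scores = matrix[i].toarray().ravel()
    print(i, terms[scores.argsort()[::-1][:4]])   # the 4 highest-scoring tokens per storyline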
data = pd.read_csv("D:/BERKELEY-GRADUATE/E295/round4/expansion_4_1.csv")
# data = data.dropna(axis=0, how='any')
abstract = data.Abstract
title = data.Title
# abstract_title = pd.Series()
# for i in range(len(title)):
#     abstract_title[str(i)] = title[i] + abstract[i]
tf = abstract

###############################################################################
# remove dominant words
# tf-idf ######################################################################
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tf)
dense_X = X.todense()
idf = vectorizer.idf_
featurename1 = vectorizer.get_feature_names()
# print(dict(zip(vectorizer.get_feature_names(), idf)))

# get dominant words
one = dense_X > 0
frequency1 = sum(one)
# plt.plot(np.transpose(frequency1))
# By looking at the frequency of each word, find the threshold
# (around 400); words with frequency > 500 are dominant words
do = pd.Series(frequency1.getA()[0], index=featurename1)
freq_sort1 = do.sort_values(ascending=False)
c1 = freq_sort1[:20].index
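A hedged sketch (toy abstracts, illustrative names; the 0.5 cut-off is an assumption, not the notebook's chosen threshold) showing that TfidfVectorizer can drop dominant words directly via max_df instead of thresholding document frequencies by hand.

from sklearn.feature_extraction.text import TfidfVectorizer

abstracts = ["deep learning for image recognition",
             "deep learning for text classification",
             "a survey of deep learning methods",
             "classical statistics for small samples"]

vec = TfidfVectorizer(stop_words='english', max_df=0.5)   # drop terms present in more than 50% of docs
X = vec.fit_transform(abstracts)
print(sorted(vec.stop_words_ & {"deep", "learning"}))      # ['deep', 'learning'] were removed as dominant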