def q133():
    def split_into_lemmas(message):
        message = message.lower()
        words = TextBlob(message).words
        # for each word, take its "base form" = lemma
        return [word.lemma for word in words]

    bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(data['text'])
    rare_feats = bow_transformer.get_feature_names()
    zacs_bow = CountVectorizer(ngram_range=(1, 2)).fit(data['text'])
    zac_bow_words = zacs_bow.get_feature_names()
    intersection = set(rare_feats).intersection(zac_bow_words)
    only_in_zac = list(set(zac_bow_words) - set(intersection))
    only_in_rare = list(set(rare_feats) - set(intersection))
    len_o_z = len(only_in_zac)
    len_o_r = len(only_in_rare)
    sum_o = len_o_z + len_o_r
    per_z = len_o_z / sum_o
    per_r = len_o_r / sum_o
    print("There are %d features in common between both groups." % len(intersection))
    print("There are %d unique features in Zac's model." % len_o_z)
    print("There are %d unique features in Rare's model." % len_o_r)
    print("Together, both models have %d features." % sum_o)
    print("Clearly, Zac's model holds %f of all features." % per_z)
    print("Clearly, Rare's model holds %f of all features." % per_r)
    print("Zac's model is much larger, on account of using bigrams and not stemming.")
def find_significant_terms(corpus):
    """
    Find words that are more common in one document than in the whole corpus.
    # {(word, count)} per list -> and per corpus
    # {(word, freq)} per list = count.list / count.corpus
    :param corpus: [clinton_text, trump_text]
    :return:
    """
    vectorizer = CountVectorizer(min_df=1)
    list_counts = np.array(vectorizer.fit_transform(corpus).toarray())
    # print list_counts
    corpus_counts = np.sum(list_counts, axis=0)  # per-term counts over the whole corpus
    # print corpus_counts
    list_freq = [1.0 * x / corpus_counts for x in list_counts]
    # print map(lambda x: x.tolist(), list_freq)
    sorted_by_freq = [list(reversed(sorted(zip(x.tolist(), vectorizer.get_feature_names()))))
                      for x in list_freq]
    sorted_by_count = [list(reversed(sorted(zip(x.tolist(), vectorizer.get_feature_names()))))
                       for x in list_counts]
    return sorted_by_freq, sorted_by_count

# print find_significant_terms(['ala ma kota', 'ala ma psa'])
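# Quick sanity check of find_significant_terms, using the toy corpus from the
# commented-out call above (illustrative only; assumes numpy and CountVectorizer
# are imported as in the snippet):
by_freq, by_count = find_significant_terms(['ala ma kota', 'ala ma psa'])
print(by_count[0])
# expected for the first document: [(1, 'ma'), (1, 'kota'), (1, 'ala'), (0, 'psa')]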
def TFIDF():
    global segcont
    global weight
    global we
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # keywords of all documents
    weight = tfidf.toarray()               # corresponding tf-idf matrix
    del segcont
    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:  # keep only terms with tf-idf weight >= 0.1
                enstr = enstr + " " + word[j]
        seg.append(enstr)
    del weight
    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()
def test_feature_names():
    cv = CountVectorizer(max_df=0.5)

    # test for ValueError on unfitted/empty vocabulary
    assert_raises(ValueError, cv.get_feature_names)
    assert_false(cv.fixed_vocabulary_)

    # test for vocabulary learned from data
    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert_equal(len(cv.vocabulary_), n_features)

    feature_names = cv.get_feature_names()
    assert_equal(len(feature_names), n_features)
    assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza',
                        'salad', 'sparkling', 'tomato', 'water'],
                       feature_names)

    for idx, name in enumerate(feature_names):
        assert_equal(idx, cv.vocabulary_.get(name))

    # test for custom vocabulary
    vocab = ['beer', 'burger', 'celeri', 'coke', 'pizza',
             'salad', 'sparkling', 'tomato', 'water']

    cv = CountVectorizer(vocabulary=vocab)
    feature_names = cv.get_feature_names()
    assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad',
                        'sparkling', 'tomato', 'water'], feature_names)
    assert_true(cv.fixed_vocabulary_)

    for idx, name in enumerate(feature_names):
        assert_equal(idx, cv.vocabulary_.get(name))
def load_dataset(dataset):
    if dataset == ['imdb']:
        # (X_pool, y_pool, X_test, y_test) = load_data()
        # vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = load_imdb(path='./aclImdb/', shuffle=True, vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(dataset, list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
            load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('./SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('./SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('./SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('./SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('./SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['ibnsina']:
        (X_pool, y_pool, X_test, y_test) = load_ibnsina()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['creditg']:
        (X_pool, y_pool, X_test, y_test) = load_creditg()
        return (X_pool, y_pool, X_test, y_test, None)
def wordMoverDistance(d1, d2):
    ### d1: list of tokens
    ### d2: list of tokens
    # Rule out words that are not in the vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])
    # print d1
    # print d2
    vect = CountVectorizer().fit([d1, d2])
    feature_names = vect.get_feature_names()
    W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]]  # word matrix
    D_ = euclidean_distances(W_)  # distance matrix
    D_ = D_.astype(np.double)
    # D_ /= D_.max()  # normalize for comparison
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    ### EMD
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    # print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
    emd_d = emd(v_1, v_2, D_)  # WMD
    # print emd_d
    return emd_d
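# Minimal, hypothetical setup for calling wordMoverDistance above. The snippet relies on
# module-level globals `W` (an embedding matrix), `vocab_dict` (word -> row index) and
# pyemd's `emd`; the toy embeddings below are random stand-ins, not real word vectors.
import numpy as np
from pyemd import emd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

vocab_dict = {w: i for i, w in enumerate(
    ["obama", "president", "speaks", "greets", "media", "press"])}
W = np.random.RandomState(0).rand(len(vocab_dict), 50)  # stand-in embedding matrix

print(wordMoverDistance("obama speaks media".split(), "president greets press".split()))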
def improveVocabulary(positiveDocuments, negativeDocuments):
    countVectPos = CountVectorizer(min_df=0.1, stop_words='english')
    countVectNeg = CountVectorizer(min_df=0.1, stop_words='english')
    positiveCandidates = []
    negativeCandidates = []
    if len(positiveDocuments) > 0:
        try:
            countVectPos.fit_transform(positiveDocuments)
            positiveCandidates = countVectPos.get_feature_names()
        except:
            a = 1  # print "count vector failed"
    if len(negativeDocuments) > 0:
        try:
            countVectNeg.fit_transform(negativeDocuments)
            negativeCandidates = countVectNeg.get_feature_names()
        except:
            a = 1  # print "countvector failed"
    global listPos, listNeg, countDictPos, countDictNeg
    # pdb.set_trace()
    for candidate in (positiveCandidates + negativeCandidates):
        score = (getMapOutput(countVectPos.vocabulary_, candidate) -
                 getMapOutput(countVectNeg.vocabulary_, candidate))
        if (score > 0 and score / getMapOutput(countVectPos.vocabulary_, candidate) >= 0.1):
            insertMap(listPos, candidate)
        elif (score < 0 and abs(score) / getMapOutput(countVectNeg.vocabulary_, candidate) >= 0.1):
            insertMap(listNeg, candidate)
def number_of_words_in_common(s1, s2):
    vec = CountVectorizer()
    counts = vec.fit_transform([s1, s2]).toarray()
    res = counts[0] * counts[1]
    i = 0
    common_words = []
    feature_names = vec.get_feature_names()
    for r in range(len(res)):
        if res[r] != 0:
            i += 1
            common_words.append(feature_names[r])
    return i, common_words
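# Illustrative call of number_of_words_in_common with made-up sentences:
count, shared = number_of_words_in_common("the cat sat on the mat", "the dog sat down")
print(count, shared)  # expected: 2 ['sat', 'the'] (names come back in vocabulary order)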
def vectorizerDataBigram(data, min_freq):
    vectorizer = CountVectorizer(analyzer='word', min_df=min_freq, lowercase=True,
                                 stop_words='english', token_pattern='(?u)\\b\\w\\w+\\b',
                                 binary=True, ngram_range=(2, 2))
    X = vectorizer.fit_transform(data)
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    regex = re.compile(r'\d')
    new_list = [s for s in vectorizer.get_feature_names() if not regex.match(s)]
    return pd.DataFrame(df, columns=new_list)
def Get_unigrams_bigrams(corpus):
    # use CountVectorizer to generate unigrams and bigrams
    unicount_vect = CountVectorizer(ngram_range=(1, 1), lowercase=False, stop_words='english',
                                    token_pattern=r'\b\w+\b', min_df=1)
    unicount = unicount_vect.fit_transform(corpus).toarray()
    unigrams = unicount_vect.get_feature_names()

    bicount_vect = CountVectorizer(ngram_range=(2, 2), lowercase=False, stop_words='english',
                                   token_pattern=r'\b\w+\b', min_df=1)
    bicount = bicount_vect.fit_transform(corpus).toarray()
    bigrams = bicount_vect.get_feature_names()
    return (unigrams, bigrams)
class NearestNeighborMethod(object):

    def __init__(self, n_results=1, ngram_range=(1, 1), tokenizer=SynonymTokenizer()):
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.neighbors import NearestNeighbors
        self.countvec = CountVectorizer(ngram_range=ngram_range, analyzer='word', lowercase=True,
                                        token_pattern='[a-zA-Z0-9]+', strip_accents='unicode',
                                        tokenizer=tokenizer)
        self.nbrs = NearestNeighbors(n_neighbors=n_results)

    def load_ref_text(self, text_file):
        import re, nltk.data
        from nltk.corpus import wordnet as wn
        from nltk.stem import WordNetLemmatizer
        textfile = open(text_file, 'r')
        lines = textfile.readlines()
        textfile.close()
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = [sent_tokenizer.tokenize(line.strip()) for line in lines]
        sentences1 = [item for sublist in sentences for item in sublist]
        chk2 = pd.DataFrame(self.countvec.fit_transform(sentences1).toarray(),
                            columns=self.countvec.get_feature_names())
        chk2[chk2 > 1] = 1
        return chk2, sentences1
        # text1 = []
        # for sent in text:
        #     new_words = []
        #     for word, word_type in sent:
        #         synonyms = list({l.name().lower() for s in wn.synsets(word) for l in s.lemmas()})
        #         new_words.append((synonyms, word_type))
        #     text1.append(new_words)

    def load_query(self, text):
        # print text
        chk2 = pd.DataFrame(self.countvec.transform([text]).toarray(),
                            columns=self.countvec.get_feature_names())
        # print chk2.shape
        chk2[chk2 > 1] = 1
        return chk2

    def get_scores(self, ref_dataframe, ref_query, n_results=1):
        self.nbrs.fit(ref_dataframe)
        return self.nbrs.kneighbors(ref_query)

    def get_results(self, query):
        ref_dataframe, ref_sentences = NearestNeighborMethod.load_ref_text(self, 'india.txt')
        # print ref_dataframe.shape, len(ref_sentences)
        ref_query = NearestNeighborMethod.load_query(self, query)
        neighbors_index = NearestNeighborMethod.get_scores(self, ref_dataframe, ref_query)[1]
        # print type(neighbors_index)
        # print neighbors_index[0]
        neighbors = list(ref_sentences[i] for i in neighbors_index[0])
        print neighbors
def addBagOfWordsFeature(wordproblems):
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=LemmaTokenizer(),
        preprocessor=None,
        stop_words=None,
        max_features=5000
    )
    train_data_features = vectorizer.fit_transform(wordproblems)
    train_data_features = train_data_features.toarray()
    vocab = vectorizer.get_feature_names()

    vocab_wo_nums = []
    for s in vocab:
        if not any(char.isdigit() for char in s):
            vocab_wo_nums.append(s)

    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=LemmaTokenizer(),
        preprocessor=None,
        stop_words=None,
        max_features=5000,
        vocabulary=vocab_wo_nums,
    )
    train_data_features = vectorizer.fit_transform(wordproblems)
    train_data_features = train_data_features.toarray()
    vocab = vectorizer.get_feature_names()
    with open("data/vocab.txt", "w") as f:
        f.write(str(vocab_wo_nums))

    numofnums = []
    numofques = []
    numofpercent = []
    for i in range(0, len(train_data_features)):
        nums = numberOfNumbers(None, wordproblems[i])
        numofnums.append(nums)
        ques = numberOfQuestions(wordproblems[i])
        numofques.append(ques)
        perc = numberOfPercent(wordproblems[i])
        numofpercent.append(perc)
    numofnums = numpy.array(numofnums)
    numofques = numpy.array(numofques)
    numofpercent = numpy.array(numofpercent)
    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofnums).T))
    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofques).T))
    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofpercent).T))
    # print train_data_features
    return (vectorizer, train_data_features)
def tfidf(corpus, word_category, file_to_write):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()
    sum_weight = np.sum(weight, axis=0)
    word = vectorizer.get_feature_names()
    word_and_weight = []
    for i in range(len(sum_weight)):
        word_and_weight.append([word[i], sum_weight[i]])
    word_and_weight.sort(key=lambda key: key[1], reverse=True)
    f = open(file_to_write, "w+")
    result = []
    for j in range(len(word_and_weight)):
        try:
            f.write(
                word_and_weight[j][0] + " " + str(word_and_weight[j][1]) + " "
                + word_category[word_and_weight[j][0]] + "\n"
            )
            result.append([word_and_weight[j][0], word_and_weight[j][1],
                           word_category[word_and_weight[j][0]]])
        except:
            continue
    f.close()
    return result
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None,  # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
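# Illustrative call of find_common_words with a made-up word list; with max_features=3
# the vectorizer keeps only the three most frequent terms:
vocab_map, names = find_common_words(
    ["free pizza tonight", "pizza and beer today", "beer is free", "pizza beer free"], 3)
print(names)      # expected: ['beer', 'free', 'pizza']
print(vocab_map)  # expected mapping: beer -> 0, free -> 1, pizza -> 2 (dict ordering may vary)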
def getContextFeature(self):
    import time
    print 'start to get Context Feature'
    start = time.time()

    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    # when we work with a large corpus, we need to feed in an iterator instead!
    corpus = self.getIterText()

    # transform the text into a word-frequency matrix
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    print 'get word'
    word = vectorizer.get_feature_names()
    print 'get weight'
    weight = tfidf
    print 'weight type:', type(weight)
    # print weight
    end = time.time()
    print 'total time: \t', end - start
    return weight, word
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names

    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english',
                                 max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape

    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape

    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)

    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()

    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]

    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features.
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in list. Will only take 30000 reviews for efficiency purposes
    test_data : List[str]
        Test data in list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix, .., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    vocab = count_vect.get_feature_names()
    return (train_tfidf, test_tfidf, vocab)
def get_data(dir):
    titles = []
    titles_label = []
    os.path.walk(dir, visit, [titles, titles_label])

    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    # fit_transform() does two things: first, it fits the model and learns the vocabulary;
    # second, it transforms our training data into feature vectors. The input to
    # fit_transform should be a list of strings.
    titles_vocab_mat = vectorizer.fit_transform(titles)

    # Numpy arrays are easy to work with, so convert the result to an array
    # print vectorizer.vocabulary_  # a dict, the value is the index
    train_data_features = titles_vocab_mat.toarray()
    print train_data_features.shape

    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print '/'.join(vocab)

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    total_words = 0
    for i in train_data_features:
        # print sum(i)
        total_words += sum(i)
    print total_words
    weka(vocab, dist, train_data_features, total_words, titles_label)
def bag_of_words_to_list(lines, max_features):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool, removing stopwords
    vectorizer = CountVectorizer(stop_words='english', max_features=max_features)
    # TfidfVectorizer: i need to check this

    print('>> Removing stopwords...')
    lines = remove_stopwords(lines, 2)

    print('>> Stemming...')
    lines = stemming(lines, 3)

    print('>> Doing bag of words...')
    bag_of_words = vectorizer.fit_transform(lines)
    # uncomment to visualize the words and how many times they are used
    # printing_bow(bag_of_words, vectorizer)
    return (vectorizer.get_feature_names(), bag_of_words.toarray())
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation
    and count vectorizer to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i, j] > 1.0 / float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print topicsByGame

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topicsByGame
def token_count_pac(pac_id, limit='ALL', ngram_range=(2, 2), min_df=5):
    conn = psql.connect("dbname='keyword-influence'")
    cursor = conn.cursor()
    cursor.execute("SELECT id, speaking \
                    FROM words \
                    WHERE id IN ( \
                        SELECT id \
                        FROM words \
                        WHERE bioguide_id IN( \
                            SELECT bioguide_id \
                            FROM pac_contrib as pc \
                            INNER JOIN congress as c \
                            ON pc.fec_candidate_id = c.fec_id \
                            WHERE pac_id = '" + pac_id + "'));")
    sql_result = cursor.fetchall()
    counter = CountVectorizer(stop_words=corpus.stopwords.words('english'),
                              ngram_range=ngram_range,
                              min_df=min_df)
    chunks = map(lambda x: x[1], sql_result)
    counts = counter.fit_transform(chunks)
    vocab = counter.get_feature_names()
    vocab = dict(zip(range(len(vocab)), vocab))
    return [counts, vocab]
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the segmentation result of each document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName, 'r+')
        content = f.read()
        corpus.append(content)

    vectorizer = CountVectorizer()
    # converts the words in the texts into a term-frequency matrix;
    # element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()
    # computes the tf-idf weight of every word, using the default Chinese stop words
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    weight = tfidf.toarray()  # tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i

    # create a tfidf folder and save the tf-idf results
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name, 'w+')
        for j in range(len(word)):
            # f.write(word[j] + " " + str(weight[i][j]) + "\n")
            # f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer=text_process, min_df=0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    return vocab
def getCount(artName):
    artLst = []
    # artDict = {}
    for fn in os.listdir(indir):
        if not fn.endswith('.xml'):
            continue
        if ':' in fn:
            fn = fn.replace(':', '/')
        fn = fn.decode('utf-8')
        # fn = unicodedata.normalize("NFC", fn)
        fn_de = unidecode(fn)
        newfn = fn_de[:-4]
        # print 'artName: ', artName, 'eval: ', newfn
        newfn = newfn.lower()
        if newfn == artName:
            # print "found article begin processing"
            # print fn
            if '/' in fn:
                fn = fn.replace('/', ':')
            fullname = os.path.join(indir, fn)
            tree = ET.parse(fullname)
            root = tree.getroot()
            page = root.find('{http://www.mediawiki.org/xml/export-0.7/}page')
            revisions = page.findall('{http://www.mediawiki.org/xml/export-0.7/}revision')
            for s in revisions:
                txt = s.find('{http://www.mediawiki.org/xml/export-0.7/}text')
                artLst.append(txt.text)
            artLst = filter(None, [one for one in artLst])
            # print "processing done; begin counting"
            vectorizer = CountVectorizer(min_df=1,
                                         token_pattern='([^\[\|\]\s\.\!\=\{\}\;\<\>\?\"\'\#\(\)\,\*]+)')
            X = vectorizer.fit_transform(artLst)
            artDict = dict(zip(vectorizer.get_feature_names(),
                               np.asarray(X.sum(axis=0)).ravel()))
            return artDict
    return -1
def vectorize_substances(training, testing):
    substances = training.substances.apply(lambda x: re.sub(r'\(|\)|,', '', x))
    substances_test = testing.substances.apply(lambda x: re.sub(r'\(|\)|,', '', x))
    vec = CountVectorizer(strip_accents="unicode", analyzer="char_wb",
                          ngram_range=(3, 3), binary=True)
    x = vec.fit_transform(substances)
    xtest = vec.transform(substances_test)
    return x, xtest, vec.get_feature_names()
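# Small illustration of what the char_wb trigram features above look like; the substance
# string is made up. analyzer="char_wb" pads each word with spaces, so trigrams never
# cross word boundaries:
demo = CountVectorizer(strip_accents="unicode", analyzer="char_wb", ngram_range=(3, 3), binary=True)
demo.fit(["ibuprofen 200mg"])
print(demo.get_feature_names())
# expected (sorted): [' 20', ' ib', '00m', '0mg', '200', 'bup', 'en ', 'fen',
#                     'ibu', 'mg ', 'ofe', 'pro', 'rof', 'upr']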
def add_issue_columns(messages):
    from sklearn.feature_extraction.text import CountVectorizer
    v = CountVectorizer(binary=True)
    issue_matrix = v.fit_transform([str(x) for x in messages['Q13_issues']]).toarray()
    issues = v.get_feature_names()
    for (i, issue) in enumerate(issues):
        messages[issue] = pd.Series(issue_matrix[:, i])
def count_fin_words(lmd, dataset):
    # Modifying the Dictionary
    lmd = lmd[['Word', 'Positive', 'Negative']]
    lmd['Sum'] = lmd['Positive'] + lmd['Negative']
    lmd = lmd[lmd.Sum != 0]
    lmd = lmd.drop(['Sum'], axis=1)
    lmd.loc[lmd['Positive'] > 0, 'Positive'] = 1
    lmd.loc[lmd['Negative'] > 0, 'Negative'] = -1
    lmd['Word'] = lmd['Word'].str.lower()

    # Counting the words in the MDA
    tf = CountVectorizer(analyzer='word', min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(dataset['MDA_Text'].values)
    feature_names = tf.get_feature_names()
    tfidf_array = tfidf_matrix.toarray()
    tfidf_df = pd.DataFrame(tfidf_array)
    tfidf_df.columns = [i.lower() for i in feature_names]
    tfidf_df = tfidf_df.T
    tfidf_df['Word'] = tfidf_df.index

    # Merging the results
    result_df = pd.merge(tfidf_df, lmd, how='inner', left_on='Word', right_on='Word')
    col_list = list(result_df)
    result_df_pos = result_df[result_df.Positive == 1]
    result_df_neg = result_df[result_df.Negative == -1]
    result_df[col_list[0:len(dataset)]].sum(axis=0)

    # Counting the positive and negative words in a financial context per document
    pos_words_sum = result_df_pos[col_list[0:len(dataset)]].sum(axis=0)
    neg_words_sum = result_df_neg[col_list[0:len(dataset)]].sum(axis=0)

    # Adding new features to the master dataframe
    dataset['Tot_pos'] = pos_words_sum.values
    dataset['Tot_neg'] = neg_words_sum.values
    return dataset
def createDTMat(fileList):
    from sklearn.feature_extraction.text import CountVectorizer
    cvec = CountVectorizer(stop_words='english')
    lines_list = readInList(fileList)
    X = cvec.fit_transform(lines_list).toarray()
    vocab = cvec.get_feature_names()
    return (X, vocab)
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(vectorizer.fit_transform(corpus))

    word = vectorizer.get_feature_names()
    weight = tfdif.toarray()

    save_path = './tfidffile'
    if not os.path.exists(save_path):  # was os._exists, which does not test filesystem paths
        os.mkdir(save_path)

    for i in range(len(weight)):
        print('--------Writing all the tf-idf in the', i, u' file into ',
              save_path + '/' + string.zfill(i, 5) + '.txt', '--------')
        f = open(save_path + '/' + string.zfill(i, 5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
def text_feature(data, text_var, nfeature, noun=False, silence=False):
    """Calculate the text features for the given data.
    text_var specifies the name of the column that contains the text.
    nfeature specifies the max number of features to be extracted from the text."""
    # First clean and parse the text data
    clean_statuses = []
    nitem = data.shape[0]
    data.index = range(nitem)
    for i in xrange(0, nitem):
        if (i + 1) % 1000 == 0 and not silence:
            print "Status %d of %d\n" % (i + 1, nitem)
        clean_statuses.append(status_to_words(data[text_var][i], noun))

    # Then extract features from the cleaned text
    print "Creating the bag of words...\n"
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=nfeature)
    data_features = vectorizer.fit_transform(clean_statuses)
    data_features = data_features.toarray()
    vocab = vectorizer.get_feature_names()

    # Sum up the counts of each vocabulary word
    counts = np.sum(data_features, axis=0)
    return {'features': data_features, 'word': vocab, 'counts': counts}
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sys import argv
import pandas as pd
from sklearn.feature_extraction import text

# my_stop_words = list(pd.read_csv('stopwords.txt', header=None)[0])
text = list(pd.read_csv(argv[1], sep='\t', header=None)[0].astype('U'))

vectorizer = CountVectorizer(encoding='utf-8', stop_words='english', lowercase=True)
vectorizer.fit_transform(text)
vector = vectorizer.transform(text)
wl = vectorizer.get_feature_names()

np.savetxt(argv[2], wl, delimiter=",", fmt='%s')
np.savetxt(argv[3], vector.toarray(), delimiter=",", fmt='%s')
import os
import sys

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

if "-" in sys.argv:
    lines = sys.stdin.readlines()
    sources = ['stdin']
else:
    sources = ([arg for arg in sys.argv[1:] if os.path.exists(arg)]
               or ["constitution.txt"])
    lines = []
    for s in sources:
        with open(s) as f:
            lines.extend(f.readlines())

text = "".join(lines)
cv = CountVectorizer(min_df=1, charset_error="ignore",
                     stop_words="english", max_features=200)
counts = cv.fit_transform([text]).toarray().ravel()
words = np.array(cv.get_feature_names())

# throw away some words, normalize
words = words[counts > 1]
counts = counts[counts > 1]
output_filename = (os.path.splitext(os.path.basename(sources[0]))[0] + "_.png")
print(output_filename)
counts = make_wordcloud(words, counts, output_filename)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from naive_bayes import SentiLexiconNB1
from utils import ReviewPreprocessing

dataset = pd.read_csv('assets/Restaurant_Reviews.tsv', delimiter='\t')

preprocessing = ReviewPreprocessing().fit(dataset['Review'])
corpus = preprocessing.corpus
senti_lexicon = preprocessing.senti_lexicon

vectorizing_regex = r"[-_'a-zA-ZÀ-ÖØ-öø-ÿ0-9]+"
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='word', token_pattern=vectorizing_regex)
classifier = SentiLexiconNB1(senti_lexicon)

X = vectorizer.fit_transform(corpus)
y = dataset.iloc[:, 1].values

classifier.fit(X, y, vectorizer.get_feature_names())

while True:
    review = input('Add your review: ')
    preprocessed_review = preprocessing.transform([review])
    X = vectorizer.transform(preprocessed_review)
    sentiment = 'Positive' if classifier.predict(X)[0] else 'Negative'
    print(sentiment)
def most_common_words_by_group(
    self,
    X,
    text_col_name,
    group_col_name,
    num_examples,
    num_times_min,
    min_ngram,
):
    """
    Get the most common phrases for defined groups.

    Parameters
    --------
    X: DataFrame
    text_col_name: str
    group_col_name: str
    num_examples: int
        Number of text examples to include per group
    num_times_min: int
        Minimum number of times word/phrase must appear in texts
    min_ngram: int

    Returns
    --------
    overall_counts_df: DataFrame
        Has groups, top words, and counts
    """
    # Fix for when column name is the same as an ngram column name
    X["group_column"] = X[group_col_name]
    # Remove all other unneeded columns
    X = X[[text_col_name, "group_column"]]

    all_stop_words = (
        set(ENGLISH_STOP_WORDS)
        | set(["-PRON-"])
        | set(string.punctuation)
        | set([" "])
    )

    cv = CountVectorizer(
        stop_words=all_stop_words,
        ngram_range=(min_ngram, 3),
        min_df=num_times_min,
        max_df=0.4,
    )
    vectors = cv.fit_transform(X[text_col_name]).todense()
    words = cv.get_feature_names()
    vectors_df = pd.DataFrame(vectors, columns=words)

    group_plus_vectors = pd.concat([vectors_df, X.reset_index(drop=False)], axis=1)

    count_words = pd.DataFrame(
        group_plus_vectors.groupby("group_column").count()["index"]
    )
    count_words = count_words.loc[:, ~count_words.columns.duplicated()]
    # Fix for when "count" is an ngram column
    count_words.columns = ["count_ngrams"]

    group_plus_vectors = group_plus_vectors.merge(
        count_words, on="group_column", how="left"
    )
    group_plus_vectors["count_ngrams"].fillna(0, inplace=True)

    sums_by_col = (
        group_plus_vectors[
            group_plus_vectors.columns[
                ~group_plus_vectors.columns.isin([text_col_name, "index"])
            ]
        ]
        .groupby("group_column")
        .sum()
    )

    sums_by_col.sort_values(by="count_ngrams", ascending=False, inplace=True)
    sums_by_col.drop("count_ngrams", axis=1, inplace=True)

    array_sums = np.array(sums_by_col)
    sums_values_descending = -np.sort(-array_sums, axis=1)
    sums_indices_descending = (-array_sums).argsort()

    highest_sum = pd.DataFrame(sums_values_descending[:, 0])
    highest_sum.columns = ["highest_sum"]
    sums_by_col["highest_sum"] = highest_sum["highest_sum"].values

    overall_counts_df = pd.DataFrame(columns=["group_name", "top_words_and_counts"])
    i = 0
    for row in sums_by_col.index:
        dict_scores = {}
        temp_df = pd.DataFrame(columns=["group_name", "top_words_and_counts"])
        temp_df["group_name"] = [row]
        top_columns = sums_by_col.columns[
            sums_indices_descending[i][:num_examples]
        ].values
        top_counts = sums_values_descending[i][:num_examples]
        [dict_scores.update({x: y}) for x, y in zip(top_columns, top_counts)]
        temp_df["top_words_and_counts"] = [dict_scores]
        overall_counts_df = overall_counts_df.append([temp_df])
        print(f"Group Name: {row}\n")
        for k, v in dict_scores.items():
            print(k, v)
        print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
        i += 1
    return overall_counts_df
y = msg.labelnum

# splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)
print("train data")
print(xtrain)

# output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print(count_vect.get_feature_names())
df = pd.DataFrame(xtrain_dtm.toarray(), columns=count_vect.get_feature_names())
print(df)  # tabular representation
print(xtrain_dtm)  # sparse matrix representation

# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# printing accuracy metrics
from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precision')
print(metrics.recall_score(ytest, predicted))
for i in xrange(0, num):
    # if (i + 1) % 1000 == 0:
    print "Review %d of %d\n" % (i + 1, num)
    perfect_words.append(cleaning_words(df["review"][i]))

# print df.head(67)
# print df["Postive rated"].mean()

X_train, X_test, y_train, y_test = train_test_split(perfect_words, df['Postive rated'], random_state=0)
# print X_train[10]
# print X_train.shape
# print df['Postive rated']

vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(X_train)
print len(vect.get_feature_names())

X_train_vetorised = vect.transform(X_train)
# print X_train_vetorised

print "starting training!!!!!"
model = LogisticRegression()
print "Stage 1 is completed"
model.fit(X_train_vetorised, y_train)
print "Stage 2 is completed"
predictions = model.predict(vect.transform(X_test))
print "Stage 3 is completed"
print("AUC:", roc_auc_score(y_test, predictions))

feature_name = np.array(vect.get_feature_names())
sort_coeff = model.coef_[0].argsort()
        # print(line.replace('\xa0',''[:100]).strip())
        segline = segment(line.strip().replace('\xa0', ''))
        # print(segline)
        doc = doc + segline + ' '
    corpus.append(doc)
    # print(docs)
    # break

stoplist = ['了', '与', '他', '我']
# CountVectorizer converts the words in the texts into a term-frequency matrix;
# element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fa5]+', stop_words=stoplist,
                             ngram_range=(1, 1), min_df=10)
transformer = TfidfTransformer()  # computes the tf-idf weight of every word
# print(corpus)
# the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
print('--------tfidf---------')
# print(tfidf)
word = vectorizer.get_feature_names()  # all words in the bag-of-words model
print('--------word---------')
print(word)
print(len(word))
weight = tfidf.toarray()  # tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
# print the tf-idf word weights of each document: the outer loop iterates over documents,
# the inner loop over the words of one document
# for i in range(len(weight)):
#     print('------- tf-idf word weights of document', i, '-------')
#     for j in range(len(word)):
#         print(word[j], weight[i][j])


def demo():
    corpus = ["我 来到 北京 清华大学",  # segmentation result of the first document, words separated by spaces
              "他 来到 了 网易 杭研 大厦",  # segmentation result of the second document
              "小明 硕士 毕业 与 中国 科学院",  # segmentation result of the third document
class TextTopics():
    """ Text classifier. """

    def __init__(self, df: pd.DataFrame, number_topics=50, instance_path=instance_path(), **kwargs):
        self._instance_path = instance_path
        self.number_topics = number_topics
        self.stop_words: List = get_stop_words("fi")
        self._count_vector: CountVectorizer = None
        self._lda: LDA = None
        self.token_cache = {}
        self._tokenizer = None
        self.min_sentence_length = 17

        # `kk` is used in association with time periods.
        self.stop_words += ["kk", "voi", "yms", "mm"]

        self.init(df, kwargs)

    def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
        """
        :param df: :class:`~pandas.Dataframe` containing text columns
        :param generate_visualization: Generate visualization of LDA results.
                                       Slows down generation notably.
        :param lang: Language for :class:`~Voikko`
        """
        if self._count_vector and self._lda:
            return True

        file_words = self.instance_path() / "word.dat"
        file_lda = self.instance_path() / "lda.dat"
        file_ldavis = self.instance_path() / "ldavis.html"

        try:
            # Try loading saved lda files.
            self._count_vector = joblib.load(file_words)
            self._lda = joblib.load(file_lda)
        except FileNotFoundError as e:
            logger.exception(e)

            texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

            # Setup word count vector
            self._count_vector = CountVectorizer(tokenizer=self.text_tokenize,
                                                 stop_words=self.stop_words)
            count_data = self._count_vector.fit_transform(texts)

            self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
            self._lda.fit(count_data)

            if generate_visualization:
                logger.debug("Generating LDA visualization. This might take a while")
                from pyLDAvis import sklearn as sklearn_lda
                import pyLDAvis

                LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data, self._count_vector)
                pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

            joblib.dump(self._count_vector, file_words)
            joblib.dump(self._lda, file_lda)

    def instance_path(self):
        path = self._instance_path / "lda" / str(self.number_topics)
        path.mkdir(exist_ok=True, parents=True)
        return path

    def tokenizer(self):
        if not self._tokenizer:
            self._tokenizer = VoikkoTokenizer("fi")
        return self._tokenizer

    @cached(LRUCache(maxsize=1024))
    def text_tokenize(self, text):
        """ Cached wrapper for `VoikkoTokenizer.tokenize()` """
        return self.tokenizer().tokenize(text)

    def find_talkingpoint(self, candidate: pd.Series) -> str:
        """ Find the most suitable sentence from the text. """
        texts = tuple(candidate.dropna())
        if len(texts) == 0:
            return None
        x = self._get_topics(texts)
        return self.nearest_sentence(x[1], texts)

    def nearest_sentence(self, topics: List[float], texts: List[str]) -> str:
        """
        Find the sentence closest to the topic.

        TODO: When joining multiple sentences, it should be checked that they
        are from the same paragraph.
        """

        @cached(LFUCache(maxsize=128))
        def lda(sentences):
            count_data = self._count_vector.transform(sentences)
            _lda = self._lda.transform(count_data)
            return _lda

        # Tokenize into sentences.
        sentences = chain(*[
            re.findall(r"\s*(.+?[\.!?])+", b, re.MULTILINE + re.DOTALL)
            for b in texts if b.strip() != ""
        ])

        # Clean up sentences.
        sentences = tuple(
            set(
                filter(lambda x: len(x) > self.min_sentence_length,
                       map(str.strip, sentences))))

        if len(sentences) == 0:
            return None

        # Find the most topical sentence.
        tl_dr = []
        distance = 1.
        prev_sentence = ""
        for current_sentence, m in zip(sentences, lda(sentences)):
            _distance = np.abs(np.mean(topics - m))
            if _distance < distance:
                tl_dr, distance = ([prev_sentence, current_sentence], _distance)
            # Previous sentence is kept to provide context for the most suitable sentence.
            prev_sentence = current_sentence

        return " ".join(filter(None, tl_dr))

    def compare_series(self, source: pd.Series, target: pd.Series):
        """
        Compare two text sets.

        The first tuple contains a topic word not found in :param:`target`, and
        the second tuple contains a word not found in :param:`source`.

        Note: This result will not be cached. Use :method:`compare_rows()` if possible.
        """
        # Convert them into tuples, so they can be cached.
        _source = tuple(source.dropna())
        _target = tuple(target.dropna())

        return self.compare_count_data(*self._get_topics(_source),
                                       *self._get_topics(_target))

    def compare_rows(self, df: pd.DataFrame, i, l):
        x = self.row_topics(df, i)
        y = self.row_topics(df, l)
        if not x or not y:
            return None
        r = self.compare_count_data(*x, *y)
        return r

    def row_topics(self, df: pd.DataFrame, idx):
        """ Return suitable topics from dataset `df` row :param:`idx` """
        x = tuple(df.loc[idx].dropna())
        if len(x) == 0:
            return None
        return self._get_topics(x)

    @cached(LRUCache(maxsize=512))
    def _get_topics(self, source: List) -> Tuple:
        count_data = self._count_vector.transform(source)
        return (count_data, self._lda.transform(count_data).mean(axis=0))

    def compare_count_data(
            self, counts_data_source, topics_source, counts_data_target,
            topics_target) -> Tuple[Tuple[str, int], Tuple[str, int]]:
        diffs = topics_source - topics_target

        topic_max = np.argmax(diffs)
        topic_min = np.argmin(diffs)

        source_words = self.suggest_topic_word(counts_data_source,
                                               counts_data_target, topic_max)
        target_words = self.suggest_topic_word(counts_data_target,
                                               counts_data_source, topic_min)

        word_for_source = self.suitable_topic_word(source_words) if len(
            source_words) else None
        word_for_target = self.suitable_topic_word(target_words) if len(
            target_words) else None

        return TopicComparision(source=Topic(id=topic_max, term=word_for_source),
                                target=Topic(id=topic_min, term=word_for_target))

    def suggest_topic_word(self, A, B, topic_id: int) -> List[Tuple[int, float]]:
        """
        Find a relevant word for the topic.

        Compares :param:`A` and :param:`B` words, and topic words, to find a
        suitable word with enough difference between `A` and `B`.

        :param A: :class:`csr_matrix` Target to find word for.
        :param B: :class:`csr_matrix` Comparative target for `A`
        :param topic_id: lda topic id number.

        :return: List of tuples in prominence order. The first item in each tuple is the
                 word vector feature number, the second is the prominence value.
        """
        # Generate sum of used words
        a_sum = A.toarray().sum(0)
        b_sum = B.toarray().sum(0)

        # Topic word, preferring unique ones.
        λ = self._lda.components_[topic_id] / self._lda.components_.sum(0)

        # Remove words from A that B has used too.
        # Note: Doesn't actually remove.
        complement = a_sum - b_sum

        # Use logarithm, so topic words are preferred.
        prominence = np.log(complement) * λ

        # Generate list of words, ordered by prominence
        r = sorted(
            [(i, prominence[i]) for i in prominence.argsort()
             if prominence[i] != 0 > -np.inf],
            key=lambda x: x[1],
            reverse=True)
        return r

    # sequence list is too volatile to be cached.
    def suitable_topic_word(self, seq: List[Tuple[int, float]]) -> str:
        """
        Find the first suitable word from the :param:`seq` list.

        :param: 1d matrix of word feature indexes. Only the first column in each
                row is interpreted as a feature number.
        """
        vector_words = self.vector_words()

        # Find first suitable word from word list
        for r in seq:
            word = vector_words[r[0]]
            if self._suitable_topic_word(word):
                return word

        return None

    @cached(LFUCache(maxsize=512))
    def _suitable_topic_word(self, word) -> bool:
        """
        Check if a word can be used as a topic word.

        Accepted word classes:
        :nimi:      Names; words like `Linux`, `Microsoft`, `Kokoomus`
        :nimisana:  Substantives; words like `ihminen`, `maahanmuutto`, `koulutus`, `Kokoomus`
        :laatusana: Adjectives; words like `maksuton`
        :nimisana_laatusana: Adjectives that are not "real", like `rohkea` or `liberaali`
        :lyhenne:   Abbreviations; words like `EU`
        :paikannimi: Geographical locations, like `Helsinki`
        :sukunimi:  Last names, like `Kekkonen`
        """
        for morph in self.tokenizer().analyze(word):
            _class = morph.get("CLASS")
            if _class in [
                    "nimi", "nimisana", "nimisana_laatusana", "lyhenne",
                    "paikannimi", "sukunimi"
            ]:
                return True
            else:
                logger.debug("Unsuitable word class %s for word %s", _class, word)
        return False

    def vector_words(self) -> List:
        """ Feature names in CountVector """
        return self._count_vector.get_feature_names()
text = []
for i in range(0, 16002):
    texte = word_tokenize(dataset['Review'][i])
    text.append(texte)
    text[i] = nltk.pos_tag(text[i])

count = []
for i in range(0, 16002):
    words, tags = zip(*text[i])
    count.append(tags)
    count[i] = ' '.join(count[i])

from sklearn.feature_extraction.text import CountVectorizer
tup = CountVectorizer()
X2 = tup.fit_transform(count).toarray()
tup.get_feature_names()

# BIGRAMS
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(2, 2), max_features=100)
X7 = count_vect.fit_transform(corpus).toarray()
count_vect.get_feature_names()

# UNIGRAMS
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=100)
X1 = cv.fit_transform(corpus).toarray()
cv.get_feature_names()

# DEPENDENT VARIABLE