def f_ngram_pos(train, test, mode='tfidf', binary=1, ngram=(1, 1), min_c=1):
    """Extract n-gram features from the part-of-speech view of the data.

    The vectorizer is fitted on the (filtered, POS-converted) training data
    only and then applied to both splits.

    :param train: raw training data; run through ``dataFilter_train`` and
        ``data_Pos`` before vectorizing
    :param test: raw test data; preprocessed with the same two helpers
    :param mode: ``'tfidf'`` selects ``TfidfVectorizer``; any other value
        selects ``CountVectorizer``
    :param binary: ``1`` for binary term indicators, anything else for counts
    :param ngram: ``(min_n, max_n)``; ``(n, n)`` keeps only n-grams of size n
    :param min_c: forwarded as ``min_df`` (terms below it are cut off)
    :return: ``(fgram_train, fgram_test, vocab)`` -- two dense feature arrays
        and the list of feature names
    """
    vectorizer_cls = (text.TfidfVectorizer if mode == 'tfidf'
                      else text.CountVectorizer)
    # ``binary=False`` is the vectorizers' default, so passing it explicitly
    # matches leaving the argument out.
    gram = vectorizer_cls(ngram_range=ngram,
                          binary=bool(binary == 1),
                          min_df=min_c)

    train = dataFilter_train(train)
    train_pos = data_Pos(train)
    gram.fit(train_pos)
    vocab = gram.get_feature_names()
    fgram_train = gram.transform(train_pos).toarray()

    test = dataFilter_train(test)
    test_pos = data_Pos(test)
    fgram_test = gram.transform(test_pos).toarray()

    return (fgram_train, fgram_test, vocab)
def f_ngram(data, mode='tfidf', binary=1, ngram=(1, 1), min_c=1):
    """Extract n-gram features from ``data``.

    :param data: iterable of documents to fit on and transform
    :param mode: ``'tfidf'`` selects ``TfidfVectorizer``; any other value
        selects ``CountVectorizer``
    :param binary: ``1`` for binary term indicators, anything else for counts
    :param ngram: ``(min_n, max_n)``; ``(n, n)`` keeps n-grams only and
        ``(1, 2)`` keeps unigrams and bigrams
    :param min_c: forwarded as ``min_df`` (terms below it are cut off)
    :return: ``(fgram, vocab)`` -- dense feature array and feature-name list
    """
    if mode == 'tfidf':
        make_vectorizer = text.TfidfVectorizer
    else:  # mode == count
        make_vectorizer = text.CountVectorizer

    # ``binary=False`` equals the library default used by the non-binary path.
    gram = make_vectorizer(ngram_range=ngram,
                           binary=bool(binary == 1),
                           min_df=min_c)
    gram = gram.fit(data)
    vocab = gram.get_feature_names()
    fgram = gram.transform(data).toarray()
    return (fgram, vocab)
def extract_features(features_train, features_test, post_train, post_test,
                     cap_train, cap_test, title_train, title_test, low_train,
                     low_test, digit_tr, digit_test, con_train, con_test,
                     back_train, back_test, front_train, front_test):
    """Assemble sparse train/test feature matrices from several views.

    Five textual views (features, context, back, front, post) are each
    vectorized with their own unigram ``CountVectorizer`` fitted on the
    training split.  The four remaining inputs (cap, title, low, digit) are
    appended as single transposed ``csr_matrix`` columns.

    :return: ``(training_vec, test_vect)`` horizontally stacked sparse matrices
    """

    def _count_pair(train_docs, test_docs):
        # One independent unigram vocabulary per view, learned on train only.
        vec = text.CountVectorizer(ngram_range=(1, 1))
        return vec.fit_transform(train_docs), vec.transform(test_docs)

    word_tr, word_te = _count_pair(features_train, features_test)
    pos_tr, pos_te = _count_pair(post_train, post_test)
    ctx_tr, ctx_te = _count_pair(con_train, con_test)
    bck_tr, bck_te = _count_pair(back_train, back_test)
    frnt_tr, frnt_te = _count_pair(front_train, front_test)

    # Column order: word, context, back, front, post, then the scalar flags.
    train_blocks = [word_tr, ctx_tr, bck_tr, frnt_tr, pos_tr]
    train_blocks += [csr_matrix(col).T
                     for col in (cap_train, title_train, low_train, digit_tr)]
    test_blocks = [word_te, ctx_te, bck_te, frnt_te, pos_te]
    test_blocks += [csr_matrix(col).T
                    for col in (cap_test, title_test, low_test, digit_test)]

    training_vec = sp.sparse.hstack(tuple(train_blocks))
    test_vect = sp.sparse.hstack(tuple(test_blocks))
    return training_vec, test_vect
def bag_of_words(tr_tweets, te_tweets, tr_targets=None, te_targets=None,
                 per_target=False, max_feats=None, normalise_counts=False,
                 **kwargs):
    """
    Calculate bag-of-words representations of train and test tweets

    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param tr_targets: pandas Series of strings, target classes (from train set);
        ``None`` (default) or an empty Series disables the per-target mode
    :param te_targets: pandas Series of strings, target classes (from test set);
        ``None`` (default) or an empty Series disables the per-target mode
    :param per_target: bool, whether to find separate BoW repr for each target class
    :param max_feats: int, maximum number of words/ngrams to keep, number of
        dimensions in returned feature matrices (required when the per-target
        mode is active)
    :param normalise_counts: bool, whether to divide the counts within each tweet
        by the number of tokens (not for Multinomial NB)
    :param kwargs: to be passed onto sklearn CountVectorizer
    :return: tuple, training feature matrix, test feature matrix, list of
        feature names (with '_bow' appended to each).  In per-target mode the
        names come from the vectorizer of the last target processed.
    :raises ValueError: if the per-target mode is active and ``max_feats`` is None
    """
    # ``None`` replaces the former ``pd.Series()`` defaults, which were built
    # once at definition time and shared between calls.
    use_per_target = (per_target
                      and tr_targets is not None and not tr_targets.empty
                      and te_targets is not None and not te_targets.empty)

    if use_per_target:
        if max_feats is None:
            # The output width must be known up front to pre-allocate.
            raise ValueError(
                'max_feats must be an int when per_target=True')
        # Create different BoW for each target
        # Only useful if using max_features - as most common words/n-grams
        # May be for only one or two of the targets
        x_tr = np.zeros((tr_tweets.shape[0], max_feats), dtype=np.int64)
        x_te = np.zeros((te_tweets.shape[0], max_feats), dtype=np.int64)
        for _targ in tr_targets.unique():
            word_bagger = text_sk.CountVectorizer(max_features=max_feats,
                                                  **kwargs)
            tr_mask = (tr_targets == _targ).values
            te_mask = (te_targets == _targ).values
            tr_counts = word_bagger.fit_transform(
                tr_tweets[tr_mask].values).toarray()
            te_counts = word_bagger.transform(
                te_tweets[te_mask].values).toarray()
            # A class may yield fewer than max_feats terms; fill only the
            # columns that exist and leave the rest zero-padded.
            n_found = tr_counts.shape[1]
            x_tr[tr_mask, :n_found] = tr_counts
            x_te[te_mask, :n_found] = te_counts
    else:
        word_bagger = text_sk.CountVectorizer(max_features=max_feats, **kwargs)
        x_tr = word_bagger.fit_transform(tr_tweets).toarray()
        x_te = word_bagger.transform(te_tweets).toarray()

    if normalise_counts:
        # Normalise counts by length of tweet
        tweet_tokenizer = tokenize.TweetTokenizer()
        tr_tweet_lens = tr_tweets.apply(tweet_tokenizer.tokenize).apply(len)
        te_tweet_lens = te_tweets.apply(tweet_tokenizer.tokenize).apply(len)
        # Clamp to 1 so zero-token tweets give all-zero rows instead of NaN.
        x_tr = np.divide(
            x_tr, np.maximum(tr_tweet_lens.values, 1)[:, np.newaxis])
        x_te = np.divide(
            x_te, np.maximum(te_tweet_lens.values, 1)[:, np.newaxis])

    return x_tr, x_te, [
        _fn + '_bow' for _fn in word_bagger.get_feature_names()
    ]
def extract_features(features_train, features_test, post_train, post_test,
                     cap_train, cap_test, title_train, title_test, low_train,
                     low_test, digit_tr, digit_test):
    """Assemble sparse train/test feature matrices.

    The ``features_*`` and ``post_*`` inputs are each vectorized with their
    own unigram ``CountVectorizer`` fitted on the training split.  The cap,
    title, low and digit inputs are appended as single transposed
    ``csr_matrix`` columns.

    :return: ``(training_vec, test_vect)`` horizontally stacked sparse matrices
    """

    def _count_pair(train_docs, test_docs):
        # Vocabulary is learned on the training documents only.
        vec = text.CountVectorizer(ngram_range=(1, 1))
        return vec.fit_transform(train_docs), vec.transform(test_docs)

    word_tr, word_te = _count_pair(features_train, features_test)
    pos_tr, pos_te = _count_pair(post_train, post_test)

    train_flags = [csr_matrix(col).T
                   for col in (cap_train, title_train, low_train, digit_tr)]
    test_flags = [csr_matrix(col).T
                  for col in (cap_test, title_test, low_test, digit_test)]

    training_vec = sp.sparse.hstack(tuple([word_tr, pos_tr] + train_flags))
    test_vect = sp.sparse.hstack(tuple([word_te, pos_te] + test_flags))
    return training_vec, test_vect
def get_test_doc_counts(input_seq_fname, vocab=None):
    """Return the per-document token totals of a text file.

    Each line of ``input_seq_fname`` is treated as one document.

    :param input_seq_fname: path of the file to read
    :param vocab: optional fixed vocabulary for the ``CountVectorizer``;
        when ``None`` the vocabulary is learned from the file itself
    :return: 2-D numpy array holding the row sums of the document-term matrix
    """
    with open(input_seq_fname) as f:
        lines = list(f)

    # Only pass ``vocabulary`` when the caller actually supplied one.
    cv_options = {} if vocab is None else {'vocabulary': vocab}
    cv = text.CountVectorizer(**cv_options)

    doc_matrix = cv.fit_transform(lines)
    return np.array(doc_matrix.sum(axis=1))
def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    """Create the (unfitted) transformers used for the title features.

    :param n_clusters: number of clusters for ``MiniBatchKMeans``
    :param pca_n_components: components kept by ``self.pca``
    :param kmpca_n_components: components kept by ``self.kmpca``
    :param kernel_n_components: components of the ``RBFSampler``
    """
    # Names of the feature groups and the not-yet-created selector.
    self.X_names = [
        'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
        'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
    ]
    self.linear_feature_selector = None

    # Binary unigram + bigram indicators; terms need a document frequency
    # of at least 30.
    self.counter = text.CountVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        min_df=30,
        binary=True,
        lowercase=True,
    )
    self.km = cluster.MiniBatchKMeans(
        n_clusters=n_clusters,
        n_init=10,
        batch_size=10000,
        verbose=1,
    )
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(
        n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(
        n_estimators=30,
        max_depth=5,
        n_jobs=4,
    )
def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    """Create the (unfitted) transformers used for the description features.

    :param n_clusters: number of clusters for ``MiniBatchKMeans``
    :param pca_n_components: components kept by ``self.pca``
    :param kmpca_n_components: components kept by ``self.kmpca``
    :param kernel_n_components: components of the ``RBFSampler``
    """
    # Names of the feature groups and the not-yet-created selector.
    self.X_names = [
        'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
        'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
    ]
    self.linear_feature_selector = None

    ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more
    ## choice for feature selection
    ## use (min_df = 0.001, max_df = 0.05) to generate fewer features -
    ## better clustering
    self.counter = text.CountVectorizer(
        stop_words='english',
        charset='utf-8',
        charset_error='ignore',
        ngram_range=(1, 1),
        min_df=0.001,
        max_df=0.05,
        binary=True,
        lowercase=True,
    )
    self.km = cluster.MiniBatchKMeans(
        n_clusters=n_clusters,
        n_init=10,
        batch_size=10000,
        verbose=1,
    )
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(
        n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(
        n_estimators=30,
        max_depth=5,
        n_jobs=4,
    )
def parseLogs(inputFile, outputFile):
    """Rank the lines of a log file by their mean TF-IDF weight.

    Every line of ``inputFile`` is one document.  A line's score is the sum
    of its TF-IDF weights divided by the number of distinct terms it
    contains.  Lines are written to ``outputFile`` in descending score order
    as ``"<3-digit line number> <line text>"``.

    :param inputFile: path of the log file to read
    :param outputFile: path of the ranking file to write (overwritten)
    """
    vectorizer = ext.CountVectorizer(tokenizer=get_tokens,
                                     stop_words='english')
    with open(inputFile) as log_file:
        lines = [line.rstrip() for line in log_file]

    doc_matrix = vectorizer.fit_transform(lines)
    tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
    weights = tf_idf_transformer.transform(doc_matrix).toarray()

    per_line_score = []
    for row in weights:
        n_terms = len(row.nonzero()[0])
        # A line without any retained token scores 0 rather than NaN (0/0).
        per_line_score.append(row.sum() / n_terms if n_terms else 0.0)

    # 1-based line numbers covering every line.  The previous
    # ``range(1, len(lines))`` was one short, so the last line was dropped.
    line_numbers = list(range(1, len(lines) + 1))
    df = pd.DataFrame({'d1': lines, 'd2': per_line_score},
                      index=line_numbers)
    df = df.sort_values(by=['d2'], ascending=False)

    with open(outputFile, 'w') as out_file:
        for index, row in df.iterrows():
            out_file.write("{0:0=3d} {1}\n".format(index, row['d1']))
def comparison_test(text):
    """Encode ``text`` with a fitted vocabulary and with the hashing trick.

    :param text: iterable of documents
    :return: ``(oh_encoded, hashing)`` -- the ``CountVectorizer`` matrix and
        the 20-feature binary, un-normalised ``HashingVectorizer`` matrix
    """
    import sklearn.feature_extraction.text as txt

    # Vocabulary-based encoding: needs a fit pass over the documents.
    oh_encoded = txt.CountVectorizer().fit_transform(text)

    # Hash-based encoding: stateless, so ``transform`` alone is enough.
    hasher = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
    hashing = hasher.transform(text)

    return oh_encoded, hashing
def create_disaster_sequence(disaster_csv_path, category_name):
    """Train and evaluate a random forest on one disaster category.

    Runs the steps one by one (count vectorizing, TF-IDF, classifier) on a
    70/30 train/test split of the ``message`` column and hands the test
    labels and predictions to ``display_results``.

    :param disaster_csv_path: path of the CSV loaded through ``ut.read_csv``
    :param category_name: name of the label column to predict
    """

    def _tokenize(message):
        # The text-cleaning pipe is rebuilt for every message, as before.
        cleaner = (pt.pipe
                   | __normalize_text__
                   | __tokenize_text__
                   | __remove_stopwords__
                   # stemming step (__stem_text__) intentionally left out
                   | __lemmatize_text__)
        return cleaner(message)

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting Data...')
    messages = disaster['message'].values
    labels = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, labels, test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=_tokenize)

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    train_features = tfidf.fit_transform(vect.fit_transform(x_train))
    classifier.fit(train_features, y_train)

    print('Running classifier on test...')
    test_features = tfidf.transform(vect.transform(x_test))
    y_pred = classifier.predict(test_features)

    print('Displaying results...')
    display_results(y_test, y_pred)
def build_model(X_train, X_test, y_train, y_test):
    """Build and evaluate a model. Also returns the test-set predictions.

    :return: ``(classifier, df_out)`` -- ``classifier`` is a dict with the
        fitted ``"model"``, ``"counter"`` and ``"transformer"``; ``df_out`` is
        a DataFrame built from ``y_test`` with ``pred``, ``target`` and
        ``match`` columns added
    """
    count_vect = sktext.CountVectorizer()
    tfidf_transformer = sktext.TfidfTransformer()

    # Fit both stages on the training texts, then reuse them for the test set.
    train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(X_train))
    test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))

    X_train, X_test = feature_selection(train_tfidf, y_train, test_tfidf,
                                        'all')

    # A dummy.DummyClassifier(strategy="stratified") can be swapped in here
    # as a baseline.
    model = svm.SVC(C=10, kernel='linear')
    model.fit(X_train, y_train)
    LOGGER.info("Model trained")

    # Test trained model:
    predicted = model.predict(X_test)
    df_out = pd.DataFrame(y_test)
    df_out['pred'] = predicted
    df_out['target'] = y_test
    df_out['match'] = df_out['pred'] == df_out['target']

    classifier = {
        "model": model,
        "counter": count_vect,
        "transformer": tfidf_transformer,
    }
    return (classifier, df_out)
def __init__(self):
    """Load ``emails.txt`` and build per-group word-count matrices.

    The JSON file must provide the keys ``"spam"``, ``"not_spam"`` and
    ``"to_classify"``, each a list of email texts.  One bag-of-words model is
    fitted on all three lists together, and the dense count matrix is split
    back into ``self.X_spam``, ``self.X_not_spam`` and ``self.X_to_classify``.
    """
    ## NOTHING TODO
    # load emails (the context manager closes the file handle again)
    with open('emails.txt') as emails_file:
        emails = json.load(emails_file)

    # get previous spam emails (spam), non spam emails (not_spam),
    # unclassified input mails (to_classify)
    spam = emails["spam"]
    not_spam = emails["not_spam"]
    to_classify = emails["to_classify"]

    # Number of emails in the two labelled groups (used as slice bounds)
    n_spam = len(spam)
    n_not_spam = len(not_spam)

    # To ignore certain common words in English that might skew your model,
    # we add them to the stop words list below. You may want to experiment
    # by choosing your own list of stop words, but be sure to keep subject
    # in this list at a minimum, as it appears in every email content.
    stop_words = text.ENGLISH_STOP_WORDS.union({'subject'})

    # Form bag of words model; min_df=10 keeps terms found in >= 10 emails
    vectorizer = text.CountVectorizer(stop_words=stop_words, min_df=10)
    X = vectorizer.fit_transform(spam + not_spam + to_classify).toarray()

    # split word counts into separate matrices
    labelled_end = n_spam + n_not_spam
    self.X_spam = X[:n_spam, :]
    self.X_not_spam = X[n_spam:labelled_end, :]
    self.X_to_classify = X[labelled_end:, :]
def split_docs_by_repeated(input_seq_fname):
    """Split a repeats file into repeated / non-repeated documents.

    Every line must look like ``<doc_id>\\t<is_repeat>\\t<doc>`` where
    ``doc_id`` has three ``-``-separated parts (the middle one is the original
    document id) and ``is_repeat`` is the literal string ``True`` for repeats.

    Input:
        input_seq_fname: the path to a text file of Mallet-formatted data for
            repeats.
    Outputs (in this order):
        repeated_docs: list of integer original ids of the repeated documents
        n_tokens: int array, whitespace-token length of every document
        repeats_mask: bool array, True where the document is a repeat
        doc_models: sparse matrix of word counts for each repeated document
        cv.vocabulary_: a map of vocabulary word to id
    """
    repeated_docs = []
    repeated_lines = []
    n_tokens = []
    repeats_mask = []

    with open(input_seq_fname) as f:
        for line in f:
            # maxsplit=2 keeps tab characters inside the document text from
            # breaking the three-way unpack.
            doc_id, is_repeat, doc = line.split('\t', 2)
            _, original_id, _ = doc_id.split('-')
            is_repeated = is_repeat == 'True'

            n_tokens.append(len(doc.split()))
            repeats_mask.append(is_repeated)
            if is_repeated:
                repeated_lines.append(doc)
                repeated_docs.append(int(original_id))

    cv = text.CountVectorizer()
    doc_models = cv.fit_transform(repeated_lines)
    n_tokens = np.array(n_tokens, dtype=int)
    repeats_mask = np.array(repeats_mask, dtype=bool)
    return repeated_docs, n_tokens, repeats_mask, doc_models, cv.vocabulary_
def BOW_vectors(classifying_docs, do_PCA, num_PCA):
    """Build binary bag-of-words vectors (optionally projected) for documents.

    :param classifying_docs: documents exposing ``.words`` (token sequence)
        and ``.sentiment`` (label)
    :param do_PCA: when equal to ``True`` the matrix is reduced with a
        truncated SVD and passed through ``projected_BOW``
    :param num_PCA: number of singular vectors to keep
    :return: ``(classifying_vectors, classifying_labels)``
    """
    print("Beginning to train BOW...")
    start = time.clock()

    joined_docs = [" ".join(doc.words) for doc in classifying_docs]
    count_vectorizer = fet.CountVectorizer(
        analyzer="word",
        lowercase=True,
        binary=True,
        stop_words=None,
        tokenizer=TreebankWordTokenizer().tokenize)
    BOW_classifying = count_vectorizer.fit_transform(joined_docs)

    classifying_vectors = BOW_classifying
    if do_PCA == True:
        # svds needs a floating point matrix; decompose the term x doc view.
        term_doc = BOW_classifying.asfptype().transpose()
        u, s, _ = svds(A=term_doc, k=num_PCA, which="LM",
                       return_singular_vectors=True)
        classifying_vectors = projected_BOW(BOW_classifying, u, s)

    print("BOW matrix has %d unique terms" % (BOW_classifying.shape[1]))
    classifying_labels = [doc.sentiment for doc in classifying_docs]

    end = time.clock()
    print("Ending BOW training... took %f seconds" % (end - start))
    return classifying_vectors, classifying_labels
def __init__(self, cat_paths):
    """Index the texts of every category directory and fit the vectorizers.

    :param cat_paths: iterable of category directory paths.  Each path is
        concatenated directly with file names and its category name is taken
        from ``path.split('/')[-2]``, so paths are expected to end with ``/``.
    """
    # A stop-word list is passed to TfidfVectorizer so that it works better.
    rus_dict = csv_parse('freqrnc2011.csv')
    stop_pos = {
        'conj', 'anum', 'intj', 'advpro', 'spro', 'apro', 'pr', 'num', 'part'
    }
    stopwords = [entry['Lemma'] for entry in rus_dict
                 if entry['PoS'] in stop_pos]
    stopwords += ['vk', 'com', 'https', 'photo']
    self.vectorizer = sktext.TfidfVectorizer(input='filename',
                                             stop_words=stopwords)

    def _has_word_chars(path):
        # Read inside a context manager so each file handle is closed again.
        with open(path, 'r', encoding='utf-8') as handle:
            return re.search(r'\w', handle.read()) is not None

    # Collect the file names into a dict keyed by category; files without
    # any word character are skipped.
    self.alltexts_dict = dict()
    for cat_dir in cat_paths:
        self.alltexts_dict[cat_dir.split('/')[-2]] = [
            cat_dir + fname for fname in os.listdir(cat_dir)
            if _has_word_chars(cat_dir + fname)
        ]

    self.alltexts_list = []
    for category in self.alltexts_dict:
        self.alltexts_list += self.alltexts_dict[category]

    self.vectorizer.fit(self.alltexts_list)
    # Count vectorizer sharing the TF-IDF vocabulary, used on in-memory text.
    self.count_vectorizer = sktext.CountVectorizer(
        stop_words=stopwords, vocabulary=self.vectorizer.vocabulary_)
    self.features = self.vectorizer.get_feature_names()
    self.alltext = get_one_text(self.alltexts_list)
    self.counts_alltext = np.array(
        self.count_vectorizer.transform([self.alltext]).mean(axis=0))[0]
def get_data():
    """Load the SMS spam collection and turn it into TF-IDF features.

    Reads the tab-separated file ``SMSSpamCollection`` (columns: label,
    message), maps ``ham``/``spam`` to 0/1, holds out 10% as a test set
    (``random_state=1``) and vectorizes the messages with character 1- to
    4-grams (``char_wb``) followed by a TF-IDF transform.

    :return: ``((training_data, y_train), (testing_data, y_test),
        count_vector, transformer)``
    """
    df = pd.read_table('SMSSpamCollection',
                       sep='\t',
                       header=None,
                       names=['label', 'sms_message'])
    df['label'] = df.label.map({'ham': 0, 'spam': 1})
    X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                        df['label'],
                                                        random_state=1,
                                                        test_size=0.1)

    # ngram_range is a tuple: releases of scikit-learn that validate
    # constructor parameters reject a list here.
    count_vector = text.CountVectorizer(ngram_range=(1, 4),
                                        analyzer='char_wb')

    # Fit the training data and then return the matrix
    training_data = count_vector.fit_transform(X_train)
    testing_data = count_vector.transform(X_test)

    # New layer of tf-idf for a better model
    transformer = text.TfidfTransformer()
    training_data = transformer.fit_transform(training_data)
    testing_data = transformer.transform(testing_data)

    return (training_data, y_train), (testing_data,
                                      y_test), count_vector, transformer
def NMF(arquivo, nt):
    """Return the row-normalised document-topic matrix of an NMF model.

    :param arquivo: iterable of documents handed to ``CountVectorizer`` and
        afterwards to ``FechaDocumento``
        (NOTE(review): presumably an open document whose lines are the
        documents -- confirm against the caller)
    :param nt: number of topics (NMF components)
    :return: array of shape (n_documents, nt); each row is divided by its sum
        (rows summing to zero produce NaN, with the warning suppressed)
    """
    # ``input='content'`` is what the former invalid value ``'arquivo'``
    # silently fell back to; recent scikit-learn rejects unknown values.
    vector = text.CountVectorizer(input='content',
                                  stop_words='english',
                                  min_df=1,
                                  strip_accents='unicode')
    arqArray = vector.fit_transform(arquivo).toarray()

    # Decomposition.  The former top-words-per-topic computation was removed:
    # its result was never used or returned.
    clf = decomposition.NMF(n_components=nt, random_state=1)
    doctopic = clf.fit_transform(arqArray)

    with np.errstate(invalid='ignore'):
        doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

    FechaDocumento(arquivo)
    return doctopic
def fromAllDocsToBow(all_docs, strip_accents=u'ascii', lowercase=True, preprocessor=None, stop_words=None, token_pattern=u"[\\w']+\\w\\b", analyzer=u'word', max_df=1.0, max_features=20000, vocabulary=None, binary=False, ngram_range=(1, 1), min_df=1, normalize=True):
    """
    From a list of documents, generate a sparse matrix containing the word
    occurrence counts.

    All keyword parameters except ``normalize`` are forwarded unchanged to
    ``CountVectorizer``.  The configured vectorizer is then handed, together
    with the documents and ``normalize``, to ``fromVectoBow``.

    :param all_docs: the documents to vectorize
    :param normalize: forwarded to ``fromVectoBow``
    :return: ``(bow, vec_param)`` -- the result of ``fromVectoBow`` and the
        vectorizer instance
    """
    # The documents are NOT passed to the constructor.  Its first parameter
    # is ``input`` (an option string), and releases with keyword-only
    # parameters raise TypeError on a positional argument.
    vec_param = txt.CountVectorizer(strip_accents=strip_accents,
                                    lowercase=lowercase,
                                    preprocessor=preprocessor,
                                    stop_words=stop_words,
                                    token_pattern=token_pattern,
                                    analyzer=analyzer,
                                    max_df=max_df,
                                    max_features=max_features,
                                    vocabulary=vocabulary,
                                    binary=binary,
                                    ngram_range=ngram_range,
                                    min_df=min_df)
    bow = fromVectoBow(all_docs, vec_param, normalize)
    return bow, vec_param
def replace_one_words(sentences):
    """Replace words that occur in a single sentence by ``UNKNOWN_WORD``.

    A binary bag-of-words matrix is built over ``sentences``.  Every term
    found in exactly one sentence is replaced in that sentence wherever it
    appears surrounded by single spaces.

    :param sentences: list of sentence strings; modified in place
    :return: all sentences joined with newlines
    """
    # Alias, not a copy: the caller's list receives the replacements too.
    all_new_list = sentences

    count_vectorizer = fet.CountVectorizer(
        analyzer="word",
        lowercase=False,
        binary=True,
        stop_words=None,
        tokenizer=TreebankWordTokenizer().tokenize)
    X = count_vectorizer.fit_transform(all_new_list)
    feature_names = count_vectorizer.get_feature_names()

    # With binary counts the column sums are document frequencies.
    word_counts = np.array(X.sum(0))[0]
    one_indices = np.where(word_counts == 1)[0]

    for column in one_indices:
        rare_word = feature_names[column]
        # Row index of the only sentence containing this term.
        sentence_idx = X[:, column].nonzero()[0].tolist()[0]
        all_new_list[sentence_idx] = all_new_list[sentence_idx].replace(
            " " + rare_word + " ", " UNKNOWN_WORD ")

    return "\n".join(all_new_list)
def build_vocab(train_data, test_data):
    """
    Dataset Dictionary Builder

    Vectorizes both datasets and builds the combined dictionary of the
    training and test tweets.  Indices follow the sorted order of the words.

    :param train_data: DataFrame of the training data
    :type train_data: pd.DataFrame
    :param test_data: DataFrame of the test data
    :type test_data: pd.DataFrame
    :return: Combined vocabulary (word -> index) for training and testing.
    :rtype: dict
    """
    # Token pattern is used to allow the vectorizer to include one letter
    # words (e.g., "a", "i")
    vectorizer = sklearntext.CountVectorizer(
        lowercase=True, token_pattern=r"[\b|\w*#]*\w*[!+#*\w*|\b]*")

    words = set()
    for data_pd in (train_data, test_data):
        # Refit per dataset; only the learned vocabulary keys are kept.
        vectorizer.fit_transform(data_pd[const.COL_TWEET])
        words.update(vectorizer.vocabulary_)

    return {word: idx for idx, word in enumerate(sorted(words))}
def create_disaster_pipeline(disaster_csv_path, category_name):
    """Train and evaluate a scikit-learn pipeline on one disaster category.

    The pipeline chains a ``CountVectorizer`` (custom tokenizer), a
    ``TfidfTransformer`` and a ``RandomForestClassifier``.  It is fitted on
    70% of the ``message`` column; the test labels and predictions are handed
    to ``display_results``.

    :param disaster_csv_path: path of the CSV loaded through ``ut.read_csv``
    :param category_name: name of the label column to predict
    """

    def _tokenize(message):
        # The text-cleaning pipe is rebuilt for every message, as before.
        cleaner = (pt.pipe
                   | __normalize_text__
                   | __tokenize_text__
                   | __remove_stopwords__
                   | __lemmatize_text__)
        return cleaner(message)

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    messages = disaster['message'].values
    labels = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, labels, test_size=0.3)

    print('Creating pipeline...')
    steps = [
        ('vect', st.CountVectorizer(tokenizer=_tokenize)),
        ('tfidf', st.TfidfTransformer()),
        ('clf', en.RandomForestClassifier()),
    ]
    pipeline = pi.Pipeline(steps)

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    y_pred = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, y_pred)
def _vectorizeProducts(self):
    """Vectorize the product texts and return the merged product frame.

    Concatenates the ``title``, ``subtitle`` and ``bullets`` columns of
    ``self.df`` into an ``info`` column and fits the vectorizer returned by
    ``self._vectorize()`` on it.  The frame is written to ``output.csv``.
    ``self.overall_prod_matrix`` ends up holding the output of
    ``self._reducedimensionality``.

    :return: the merged DataFrame (``dfmerged``)
    """
    # NOTE(review): ``prodvectorizer`` is never used below; the vectorizer
    # actually fitted is the one returned by ``self._vectorize()``.
    prodvectorizer = text.CountVectorizer()
    prodvectorize = self._vectorize()
    dfmerged = pd.DataFrame()
    dfmerged["productid"] = self.df["productid"]
    dfmerged["title"] = self.df["title"]
    # The three text columns are joined without any separator.
    dfmerged["info"] = pd.concat([
        self.df["title"].astype(str) + self.df["subtitle"].astype(str) +
        self.df["bullets"].astype(str)
    ], axis=1, join='inner')
    self.overall_prod_matrix = prodvectorize.fit_transform(
        dfmerged["info"])
    # Positional row number, used to look up each product's matrix row.
    dfmerged["count"] = range(0, len(dfmerged))
    dfmerged["svd"] = self.overall_prod_matrix
    dfmerged["coordinate"] = dfmerged["count"].apply(
        lambda index: self.overall_prod_matrix[index, ])
    # NOTE(review): the result of ``drop`` is discarded, so the "count"
    # column is still present in the CSV and in the returned frame.
    dfmerged.drop("count", axis=1)
    dfmerged.to_csv("output.csv")
    # The matrix attribute is replaced by its reduced version only after the
    # per-row "coordinate" values were taken from the unreduced matrix.
    self.overall_prod_matrix = self._reducedimensionality(
        self.overall_prod_matrix, features=self.overall_weight)
    return dfmerged

'''
def extractTopic(self):
    """Fit an NMF topic model on the files and print the topics.

    * Prefixes every entry of ``self.fileNames`` with ``self.baseDirectory``
      (in place)
    * Builds a document-term count matrix from those files (English stop
      words removed, terms must occur in at least two documents)
    * Factorises the matrix with NMF into ``self.num_topics`` topics
    * Appends the ``self.num_top_words`` highest-weighted words of every
      topic to ``self.topic_words`` and prints up to 15 words per entry
    """
    self.vectorizer = text.CountVectorizer(input='filename',
                                           stop_words='english',
                                           min_df=2)

    # Turn the bare file names into full paths, updating the list in place.
    for position, name in enumerate(self.fileNames):
        self.fileNames[position] = self.baseDirectory + name

    self.dtm = self.vectorizer.fit_transform(self.fileNames).toarray()
    self.vocab = np.array(self.vectorizer.get_feature_names())

    self.clf = decomposition.NMF(n_components=self.num_topics,
                                 random_state=1)
    self.doctopic = self.clf.fit_transform(self.dtm)

    for topic in self.clf.components_:
        # Indices of the heaviest terms first.
        ranked = np.argsort(topic)[::-1]
        top_idx = ranked[0:self.num_top_words]
        self.topic_words.append([self.vocab[i] for i in top_idx])

    for t, words in enumerate(self.topic_words):
        print("Topic {}: {}".format(t, ' '.join(words[:15])))
def count_vectorizer(df, col_name, vocab=None):
    """
    String Vectorizer With Optional Dictionary Support

    Tokenizes one column of a pandas DataFrame using either the provided
    dictionary or the one implicit to the data itself, and returns the
    un-normalised TF-IDF scores of each word.

    :param df: Source data frame to vectorize
    :type df: pd.DataFrame
    :param col_name: Name of the feature column in the pandas DataFrame
    :type col_name: string
    :param vocab: Optional mapping of word to column index
    :type vocab: dict
    :return: Dense TF-IDF word matrix and the vocabulary that was used.
    :rtype: Tuple(np.ndarray, dict)
    """
    english_stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = sklearntext.CountVectorizer(lowercase=True,
                                             stop_words=english_stop_words,
                                             vocabulary=vocab)
    counts = vectorizer.fit_transform(df[col_name])

    # Without a caller-supplied dictionary, report the learned one.
    used_vocab = vectorizer.vocabulary_ if vocab is None else vocab

    weigher = sklearntext.TfidfTransformer(norm=None)
    return weigher.fit_transform(counts).toarray(), used_vocab
def find_NMF_topics(self):
    """Fit an NMF topic model on ``self.all_documents`` (file names).

    Reads ``self.min_df``, ``self.max_df``, ``self.num_topics`` and
    ``self.num_top_words``.  Stores the document-topic weights in
    ``self.doctopic`` and, per topic, the list of its highest-weighted words
    in ``self.topic_words``.

    :return: None
    """
    vectorizer = text.CountVectorizer(input='filename',
                                      stop_words='english',
                                      min_df=self.min_df,
                                      max_df=self.max_df)
    dtm = vectorizer.fit_transform(self.all_documents).toarray()
    vocab = np.array(vectorizer.get_feature_names())

    clf = decomposition.NMF(n_components=self.num_topics, random_state=1)
    # Weight of every topic for every document.
    self.doctopic = clf.fit_transform(dtm)

    self.topic_words = []
    for topic in clf.components_:
        # Term indices ordered from heaviest to lightest.
        ranked = np.argsort(topic)[::-1]
        top_idx = ranked[0:self.num_top_words]
        self.topic_words.append([vocab[i] for i in top_idx])
def train():
    """Build the linear SVM (hinge-loss SGD) text classifier.

    Loads the ``'train'`` split through ``__init__.load_data``, fits a
    count -> TF-IDF -> ``SGDClassifier`` pipeline on it, prints the training
    time and evaluates the model on the same data.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    # Chain the processing steps so they can be fitted and applied as one.
    text_clf = pipeline.Pipeline(steps=[
        ('vect', text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')),
        ('tfidf', text.TfidfTransformer()),
        ('clf-sgdc', linear_model.SGDClassifier(loss='hinge',
                                                penalty='l2',
                                                alpha=1e-3,
                                                tol=1e-3,
                                                random_state=42)),
    ])

    start = time.time()
    text_clf.fit(features, labels)
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)
    return text_clf
def count_vectorizer(docs, ngram_range=(1, 1), max_features=None, **kwargs):
    """
    Fit a term-frequency (count) vectorizer on a collection of strings.

    Args:
        docs(List): a sequence of strings
        ngram_range(Tuple): (min_n, max_n) lower and upper boundary of the
            n-gram sizes to extract; every n with min_n <= n <= max_n is used.
        max_features(int or None): If not None, build a vocabulary that only
            considers the top max_features ordered by term frequency across
            the corpus.
        **kwargs: any further ``CountVectorizer`` constructor arguments

    Returns:
        term frequency (count) vectorizer

        The vectorizer is trained on the input `docs`. It can be passed to
        modeling functions or to :func:`term_doc_matrix_to_pandas` to get a
        Pandas DataFrame.

        See the `scikit-learn CountVectorizer documentation
        <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
        for attributes of the vectorizer.
    """
    prepared_docs = preprocessing._u(docs)
    options = dict(kwargs, ngram_range=ngram_range, max_features=max_features)
    # ``fit`` returns the vectorizer itself.
    return sktext.CountVectorizer(**options).fit(prepared_docs)
def __init__(self):
    """Load the Federalist papers and build per-author word-count matrices.

    ``fedpapers_split.txt`` must contain a JSON list of three lists of texts:
    the papers by Hamilton, by Madison, and the disputed ones.  One
    bag-of-words model is fitted on all papers together, and the dense count
    matrix is split back into ``self.XH``, ``self.XM`` and ``self.XD``.
    """
    # load Documents (the context manager closes the file handle again)
    with open('fedpapers_split.txt') as papers_file:
        papers = json.load(papers_file)

    # split Documents
    papersH = papers[0]  # papers by Hamilton
    papersM = papers[1]  # papers by Madison
    papersD = papers[2]  # disputed papers

    # Number of Documents for H and M (used as slice bounds)
    nH = len(papersH)
    nM = len(papersM)

    # To ignore certain common words in English that might skew your model,
    # we add them to the stop words list below. You may want to experiment by
    # choosing your own list of stop words, but be sure to keep the author
    # names in this list at a minimum, as their names appear in the text of
    # the papers and leaving them in could lead to unpredictable results.
    #
    # CountVectorizer lower-cases the text before removing stop words, so the
    # lower-case spellings do the filtering; the upper-case entries alone
    # never matched a token.
    stop_words = text.ENGLISH_STOP_WORDS.union(
        {'HAMILTON', 'MADISON', 'hamilton', 'madison'})

    # Form bag of words model; min_df=10 keeps terms found in >= 10 papers
    vectorizer = text.CountVectorizer(stop_words=stop_words, min_df=10)
    X = vectorizer.fit_transform(papersH + papersM + papersD).toarray()

    # To visualize the full list of words remaining after filtering, print
    # ``vectorizer.vocabulary_`` here.

    # split word counts into separate matrices
    self.XH = X[:nH, :]
    self.XM = X[nH:nH + nM, :]
    self.XD = X[nH + nM:, :]
def build_nontext_vector(fin, colname, colidx, normalize):
    """ Handles the specified column as a categorical variable.

    Writes one lower-cased value per data row (the header row is skipped) to
    a temporary file, vectorizes it with a ``CountVectorizer`` limited to 100
    features and stores the matrix in Matrix Market format next to the input
    as ``<name>.<colname>.mtx``.  Nothing is done if that file exists.

    :param fin: path of the input CSV file
    :param colname: column name, used for the message and output file name
    :param colidx: zero-based index of the column inside each CSV row
    :param normalize: if true, spaces inside a value are replaced by ``_``
    """
    print("Building category vector for %s" % (colname))
    fout = fin.replace(".csv", "." + colname + ".mtx")
    if os.path.isfile(fout):
        return

    ftmp = fin.replace(".csv", ".tmp")
    # NOTE(review): the binary file modes are the Python 2 csv idiom this
    # code was written for -- confirm before running under Python 3.
    try:
        with open(fin, 'rb') as source, open(ftmp, 'wb') as tmpwriter:
            for ln, row in enumerate(csv.reader(source), 1):
                if ln <= 1:
                    continue  # header row
                if ln % 1000 == 0:
                    print("...(processed %d lines)" % (ln))
                colval = row[colidx].lower()
                if normalize:
                    colval = colval.replace(" ", "_")
                if len(colval.rstrip()) == 0:
                    colval = "UNK"
                tmpwriter.write(colval + "\n")

        # Reader is closed before the temp file is removed.
        with open(ftmp, 'rb') as tmpreader:
            vectorizer = sft.CountVectorizer(max_features=100)
            catmatrix = vectorizer.fit_transform(tmpreader)
    finally:
        # Do not leave the temp file behind, even when a step above failed.
        if os.path.exists(ftmp):
            os.remove(ftmp)

    with open(fout, 'wb') as writer:
        sio.mmwrite(writer, catmatrix)