def get_topic_term_tfidf(topic_texts, min_df=1):
    """Fit a unigram TF-IDF model and total each term's weight over all texts.

    Args:
        topic_texts: Iterable of raw text documents.
        min_df: Minimum document frequency, forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(weights, vectorizer)`` tuple. ``weights`` is the column-wise sum
        of the dense TF-IDF matrix (one entry per vocabulary term) and
        ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    # Single words only, with English stop words removed.
    vector = TfidfVectorizer(ngram_range=(1, 1),
                             stop_words='english',
                             min_df=min_df)
    # The previous standalone build_analyzer() call discarded its result;
    # fit_transform builds the analyzer itself.
    tfidf = vector.fit_transform(topic_texts)
    return tfidf.toarray().sum(axis=0), vector
def tfidf(wordlist):
    """Map terms to their TF-IDF weight in the first document of *wordlist*.

    A uni-/bi-gram TF-IDF model (English stop words removed) is fitted on
    every document in ``wordlist``. Each term that is non-zero in at least
    one document becomes a key; the value is always read from row 0, so a
    term that only occurs in later documents maps to ``0.0``.

    Args:
        wordlist: Iterable of raw text documents.

    Returns:
        dict mapping term -> weight of that term in the first document.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Named ``vectorizer`` so the local no longer shadows this function.
    vectorizer = TfidfVectorizer(stop_words='english',
                                 analyzer='word',
                                 ngram_range=(1, 2))
    response = vectorizer.fit_transform(wordlist)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    tfidf_dict = {}
    for col in response.nonzero()[1]:
        tfidf_dict[feature_names[col]] = response[0, col]
    return tfidf_dict
def __init__(self, n_features, voc_file):
    """Store the configuration, load word clusters and build a tokenizer.

    Args:
        n_features: Stored unchanged on ``self.n_features``.
        voc_file: Passed to ``self.read_word_cluster``; also stored on
            ``self.voc_file``.
    """
    self.n_features = n_features
    self.voc_file = voc_file

    # read_word_cluster yields a pair: (word_clusters, grouped_words).
    cluster_info = self.read_word_cluster(voc_file)
    self.word_clusters, self.grouped_words = cluster_info

    # Only the analyzer callable is kept; the vectorizer is never fitted.
    self.vectorize = TfidfVectorizer(
        encoding='iso-8859-1', stop_words='english').build_analyzer()
def get_vocab(texts, rate):
    """Select the highest-scoring words that together cover ``rate`` of tokens.

    Words are ranked by their mean non-zero TF-IDF weight (descending) and
    accumulated until the share of analysed tokens they account for exceeds
    ``rate``.

    Args:
        texts: Sized, re-iterable collection of raw documents.
        rate: Coverage threshold, compared against ``count / total_tokens``.

    Returns:
        ``(result_list, analyzer)``: the selected words in ranking order and
        the analyzer callable of the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(min_df=5, stop_words='english')
    features = vectorizer.fit_transform(texts).tocsc()
    vocab = vectorizer.get_feature_names()
    analyzer = vectorizer.build_analyzer()

    # Recover document frequencies from the smoothed idf values; used only as
    # a sanity check against the number of stored entries per column.
    df = 1. / np.exp(vectorizer.idf_ - 1) * (len(texts) + 1) - 1

    word_value_list = []
    for i, word in enumerate(vocab):
        # Slice the CSC column once instead of three times per word.
        column_values = features[:, i].data
        assert len(column_values) == int(round(df[i]))
        word_value_list.append(
            [word, np.mean(column_values), len(column_values)])
    word_value_list.sort(key=lambda t: t[1], reverse=True)

    # One pass over the corpus: analyse each text once and gather both the
    # total token count and the per-word counts.
    total = 0
    word_counter = {word: 0 for word in vocab}
    for text in texts:
        tokens = analyzer(text)
        total += len(tokens)
        for word in tokens:
            if word in word_counter:
                word_counter[word] += 1

    cnt = 0
    result_list = []
    # Loop names chosen so they no longer shadow the ``df`` array above.
    for i, (word, _mean_weight, _doc_freq) in enumerate(word_value_list):
        result_list.append(word)
        cnt += word_counter[word]
        if cnt / total > rate:
            print(f'{i+1} words take {cnt / total} content.')
            break
    return result_list, analyzer
def consider_glove():
    """Train an SGD classifier on averaged GloVe vectors and print accuracy.

    Each text is tokenized with the default ``TfidfVectorizer`` analyzer and
    represented by the mean of the ``glove`` vectors of its known words (a
    zero vector when none is known). Reads the module-level ``glove``,
    ``train_f``, ``vali_f`` and ``test_f``.
    """
    tokenizer = TfidfVectorizer().build_analyzer()

    def prepare(df: pd.DataFrame,
                fit: bool = False) -> Tuple[np.ndarray, np.ndarray]:
        # ``fit`` is accepted but not used.
        labels = np.array(df.label.values)
        n_rows = len(labels)
        dim = len(glove["the"])
        matrix = np.zeros((n_rows, dim))
        for row, example in enumerate(df.text):
            known = [glove[word] for word in tokenizer(example)
                     if word in glove]
            for vec in known:
                matrix[row] += vec
            if len(known) > 0:
                matrix[row] /= len(known)
        return (labels, matrix)

    y_train, X_train = prepare(train_f, fit=True)
    y_vali, X_vali = prepare(vali_f)
    # The test split is prepared but not scored here.
    y_test, X_test = prepare(test_f)

    model = SGDClassifier()
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    print("glove-Train-Acc: {:.3}".format(train_acc))
    vali_acc = model.score(X_vali, y_vali)
    print("glove-Vali-Acc: {:.3}".format(vali_acc))
def analyze_corpus(images):
    """Fit a TF-IDF vectorizer on the joined descriptions of each image.

    Args:
        images: Iterable whose items are iterables of description strings.

    Returns:
        ``(vectorizer, analyzer)``: the fitted ``TfidfVectorizer`` and its
        analyzer callable. Only tokens of three or more word characters
        match the token pattern.
    """
    # One document per image: its descriptions joined by single spaces.
    documents = list(map(' '.join, images))
    vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')
    vectorizer.fit(documents)
    return vectorizer, vectorizer.build_analyzer()
def ida(articles): stopwords = [] doc_terms = [] with open('ch_stopwords.txt', 'r') as f: stopwords = set(f.read().lower().split('\n')) #print('stopwords', stopwords[:10]) trigram_vectorizer = TfidfVectorizer( ngram_range=(2, 3), token_pattern=r'([\u4e00-\u9fa5]{1}|)', min_df=10, max_df=20, stop_words=stopwords, analyzer='word') analyzer = trigram_vectorizer.build_analyzer() ''' for article in articles: terms = map(lambda x: x.replace(' ', ''), analyzer(article.Content)) #get rid of spaces #terms = set(map(lambda x: x.replace(' ', ''), analyzer(article.Content))) #get rid of spaces #terms = list(terms-stopwords) doc_terms.append(list(terms)) ''' article_contents = map(lambda x: x.Content, articles) doc_terms = trigram_vectorizer.fit_transform(article_contents) tf_feature_names = trigram_vectorizer.get_feature_names() print(len(tf_feature_names), tf_feature_names[100:200]) vocab = trigram_vectorizer.vocabulary_ joblib.dump(vocab, 'lda-vocab.pkl', compress=1) #print_top_words(lda, tf_feature_names, 10) #print(doc_terms.get_feature_names()) '''
def __init__(self,
             filename,
             doc_text_header=None,
             doc_id_header=None,
             num_phrases=10):
    """Load the CSV at *filename* and compute n-gram TF-IDF over its text.

    Args:
        filename: Path of the CSV file to read.
        doc_text_header: Forwarded to ``determine_header_names``.
        doc_id_header: Forwarded to ``determine_header_names``.
        num_phrases: Stored on ``self.num_phrases``.
    """
    # Basic bookkeeping.
    self.time = time.time()
    self.num_phrases = num_phrases
    # NOTE(review): both names are cut at the FIRST '.', so a dot anywhere
    # in the path (e.g. './x.csv') truncates them - confirm this is wanted.
    self.filename_full = filename.partition('.')[0]
    self.filename = os.path.basename(filename).partition('.')[0]
    self.time_filename = self.filename + '_streaming_time.txt'
    self.ngrams = (3, 5)
    self.index_to_docid = Counter()
    self.docid_to_index = Counter()

    self.data = pandas.read_csv(filename, lineterminator='\n')
    self.determine_header_names(doc_text_header, doc_id_header)
    self.num_ads = len(self.data.index)
    self.cluster_graph = nx.Graph()

    # Tokens are runs of non-whitespace and case is kept, so emojis and
    # capitalization survive tokenization.
    tfidf = TfidfVectorizer(ngram_range=self.ngrams,
                            sublinear_tf=True,
                            lowercase=False,
                            token_pattern=r'[^\s]+')
    self.tokenizer = tfidf.build_analyzer()

    def title_plus_description(row):
        combined = '{} {}'.format(row['title'], row[self.description])
        return filter_text(combined)

    self.data[self.description] = self.data.apply(title_plus_description,
                                                  axis=1)
    self.tfidfs = tfidf.fit_transform(self.data[self.description])
    self.tfidf_indices = tfidf.get_feature_names()
    print('done with tfidf', time.time() - self.time)
def build_analyzer(self):
    """Return an analyzer that drops punctuation before the stock analysis.

    Returns:
        A callable taking a sentence; every character found in
        ``punctuation`` is removed and the remainder is handed to the
        analyzer built by ``TfidfVectorizer.build_analyzer``.
    """
    base_analyzer = TfidfVectorizer.build_analyzer(self)

    def strip_then_analyze(sentence):
        kept = [ch for ch in sentence if ch not in punctuation]
        return base_analyzer(''.join(kept))

    return strip_then_analyze
def tfidf_vector():
    """Build TF-IDF matrices for the newsgroup train and test sets.

    Feature extraction, method 3. Reads the module-level ``newsgroup_train``
    and ``newsgroups_test`` objects (their ``.data`` attributes).

    Returns:
        ``(tfidf_train_2, tfidf_test_2)`` sparse matrices that share the
        vocabulary learned from the training data.
    """
    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
    # Transform the test set with the vectorizer fitted on the training set.
    # The previous code fitted a second vectorizer on the test data, which
    # re-estimated idf from test documents and dropped the sublinear_tf and
    # stop_words settings, so train and test features were not comparable.
    tfidf_test_2 = tv.transform(newsgroups_test.data)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
    return tfidf_train_2, tfidf_test_2
def learn_vocabulary(docs, only_noun_phrases=True):
    """Collect candidate phrases and their per-document features.

    For every document, all 1- to 3-gram phrases from the analyzer are
    filtered with ``valid_ngram`` against a pre-extracted noun-phrase set.
    For each accepted phrase the relative position of its first occurrence
    in the preprocessed document and its entropy are recorded.

    Args:
        docs: Iterable of raw documents.
        only_noun_phrases: Accepted for interface compatibility; not used.

    Returns:
        ``(vocab, first_occurrence_all, entropy_all)``: the set of accepted
        phrases plus, per document, a dict phrase -> relative first position
        (float in [0, 1)) and a dict phrase -> entropy.
    """
    print("loading pre-extracted set of noun_phrases")
    # The set is produced offline by lemmatizing the candidate chunks of each
    # training document. Pickles must be opened in binary mode.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('./semeval_train_docs_noun_phrases.set', 'rb') as f:
        noun_phrases = pickle.load(f)

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()

    first_occurrence_all = []
    entropy_all = []
    vocab = set()
    print("--learning vocabulary")
    for doc_index, doc in enumerate(docs):
        print("--learning doc %s" % doc_index)
        first_occurrence = {}
        entropy = {}
        not_found = set()
        phrases = analyzer(doc)  # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for phrase in phrases:
            if phrase in first_occurrence or phrase in not_found:
                continue
            if not valid_ngram(phrase, noun_phrases):
                continue
            # str.find returns -1 when the phrase is absent (it never
            # raises), so the miss has to be tested explicitly.
            pos = doc.find(phrase)
            if pos < 0:
                print("--phrase: '{}' not found".format(phrase))
                not_found.add(phrase)
                continue
            # float() keeps this a true division on Python 2 as well; plain
            # integer division always produced 0 there.
            first_occurrence[phrase] = float(pos) / doc_length
            # calculate entropy
            entropy[phrase] = get_entropy(phrase, chunks)
            vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print("--size of vocabulary:  %d" % len(vocab))
    return vocab, first_occurrence_all, entropy_all
def method3(newsgroup_train, newsgroups_test):
    """Vectorize the train and test sets with TF-IDF and print their shapes.

    Args:
        newsgroup_train: Object whose ``.data`` holds the training documents.
        newsgroups_test: Object whose ``.data`` holds the test documents.

    Returns:
        None. The matrices are only used to report their shapes.
    """
    print('*************************\nTfidfVectorizer\n*************************')
    from sklearn.feature_extraction.text import TfidfVectorizer

    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
    # Reuse the fitted vectorizer for the test set. Fitting a second one on
    # the test data (as before) re-estimated idf from test documents and lost
    # the sublinear_tf / stop_words settings used for training.
    tfidf_test_2 = tv.transform(newsgroups_test.data)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
class GraphsizePretrained(BaseEstimator, TransformerMixin):
    """Turn documents into co-occurrence graphs over pretrained-vector terms.

    The constructor reads a whitespace-separated embedding file (first field
    is the word, the remaining fields are parsed as float32 components).
    ``transform`` maps each document to a ``(graph, embeddings)`` pair.
    """

    def __init__(self, w=2, pretrained_vec='glove.6B.100d', verbose=False):
        """Load the embedding file.

        Args:
            w: Number of preceding terms linked to each term.
            pretrained_vec: Path of the embedding text file.
            verbose: When true, iterables are wrapped in ``tqdm``.
        """
        super(GraphsizePretrained, self).__init__()
        self.w = w
        self.pretrained_vec = pretrained_vec
        self.embeddings_dict = {}

        if verbose:
            from tqdm import tqdm
            self.progress_bar = tqdm
        else:
            # Identity wrapper keeps the call sites uniform.
            self.progress_bar = lambda x: x

        with open(self.pretrained_vec, 'r') as f:
            for line in self.progress_bar(f):
                fields = line.split()
                vector = np.asarray(fields[1:], "float32")
                self.ndim = len(vector)
                self.embeddings_dict[fields[0]] = vector

        # Word -> position in the embedding file's insertion order.
        self.vocab = {
            word: index
            for index, word in enumerate(self.embeddings_dict)
        }
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor)

    def fit(self, X, y=None):
        """Record the number of samples and return ``self``."""
        self.N = len(X)
        return self

    def transform(self, text):
        """Return one ``(graph, embedding_matrix)`` pair per document."""
        analyze = self.analyzer.build_analyzer()
        docs = [analyze(raw) for raw in self.progress_bar(text)]
        return [self._build_graph_(doc) for doc in self.progress_bar(docs)]

    def _build_graph_(self, doc):
        """Build the co-occurrence graph for one tokenized document.

        Tokens without an embedding are dropped. Every term is paired with
        up to ``self.w`` terms preceding it; the pair count becomes the
        ``freq`` weight of the edge.
        """
        terms = [token for token in doc if token in self.embeddings_dict]
        sorted_terms = sorted(set(terms))

        cooccur_count = Counter()
        for i, idt in enumerate(terms):
            window = terms[max(i - self.w, 0):i]
            # Sort each pair so (a, b) and (b, a) count as the same edge.
            cooccur_count.update(
                [tuple(sorted((previous, idt))) for previous in window])

        G = nx.Graph()
        G.add_nodes_from(sorted_terms)
        weighted_edges = [(s, t, count)
                          for (s, t), count in cooccur_count.items()]
        G.add_weighted_edges_from(weighted_edges, weight='freq')

        embeddings = np.array(
            [self.embeddings_dict[term] for term in sorted_terms])
        return G, embeddings
def vectorize(u_plus_v, batch_size, list_documents, labels, args):
    """Average word vectors for one batch and append them to the globals.

    Processes documents ``[batch_size * args, batch_size * (args + 1))``.
    For each document the vectors of all known words are averaged and
    appended to ``global_X``, with the label appended to ``global_Y``.
    Documents without any known word are skipped, which keeps both lists
    aligned.

    Args:
        u_plus_v: When true, use the mean of the 'u' and 'v' vectors of each
            word; otherwise only the 'u' vector.
        batch_size: Number of documents per batch.
        list_documents: Sliceable sequence of raw documents.
        labels: Sliceable sequence of labels, parallel to ``list_documents``.
        args: Index of the batch to process.
    """
    start_index = args
    analyze = TfidfVectorizer().build_analyzer()
    start = datetime.datetime.now()

    # Compute the batch bounds once and reuse them for both sequences.
    batch = slice(batch_size * start_index, batch_size * (start_index + 1))
    for d, y in zip(list_documents[batch], labels[batch]):
        document = []
        for w in analyze(d):
            try:
                if u_plus_v:
                    glove = (global_V['u'][global_D[w]] +
                             global_V['v'][global_D[w]]) / 2
                else:
                    glove = global_V['u'][global_D[w]]
            except KeyError:
                # Word is not in the vocabulary: ignore it.
                continue
            document.append(glove)

        if document:
            global_X.append(np.mean(document, axis=0))
            global_Y.append(y)

    delta = datetime.datetime.now() - start
    print("process in " + str(delta.total_seconds()) + "s")
class Analyzer(object):
    """Callable producing n-gram features plus per-term category features."""

    def __init__(self):
        """Create the 1- to 3-gram vectorizer and cache its callables."""
        self.tfidf = TfidfVectorizer(tokenizer=Tokenizer(),
                                     ngram_range=(1, 3),
                                     min_df=1,
                                     binary=False)
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        """Return the n-grams of *sentence* followed by term categories.

        A category is appended for each token whose ``term_category`` value
        differs from the token itself.
        """
        features = self.ngram(sentence)
        for token in self.tokens(sentence):
            category = term_category(token)
            if category == token:
                continue
            features.append(category)
        return features
def learn_vocabulary(docs, only_noun_phrases=True):
    """Collect candidate phrases and their per-document features.

    For every document, all 1- to 3-gram phrases from the analyzer are
    filtered with ``valid_ngram`` against a pre-extracted noun-phrase set.
    For each accepted phrase the relative position of its first occurrence
    in the preprocessed document and its entropy are recorded.

    Args:
        docs: Iterable of raw documents.
        only_noun_phrases: Accepted for interface compatibility; not used.

    Returns:
        ``(vocab, first_occurrence_all, entropy_all)``: the set of accepted
        phrases plus, per document, a dict phrase -> relative first position
        (float in [0, 1)) and a dict phrase -> entropy.
    """
    print("loading pre-extracted set of noun_phrases")
    # The set is produced offline by lemmatizing the candidate chunks of each
    # training document. Pickles must be opened in binary mode.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('./semeval_train_docs_noun_phrases.set', 'rb') as f:
        noun_phrases = pickle.load(f)

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()

    first_occurrence_all = []
    entropy_all = []
    vocab = set()
    print("--learning vocabulary")
    for doc_index, doc in enumerate(docs):
        print("--learning doc %s" % doc_index)
        first_occurrence = {}
        entropy = {}
        not_found = set()
        phrases = analyzer(doc)  # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for phrase in phrases:
            if phrase in first_occurrence or phrase in not_found:
                continue
            if not valid_ngram(phrase, noun_phrases):
                continue
            # str.find returns -1 when the phrase is absent (it never
            # raises), so the miss has to be tested explicitly.
            pos = doc.find(phrase)
            if pos < 0:
                print("--phrase: '{}' not found".format(phrase))
                not_found.add(phrase)
                continue
            # float() keeps this a true division on Python 2 as well; plain
            # integer division always produced 0 there.
            first_occurrence[phrase] = float(pos) / doc_length
            # calculate entropy
            entropy[phrase] = get_entropy(phrase, chunks)
            vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print("--size of vocabulary:  %d" % len(vocab))
    return vocab, first_occurrence_all, entropy_all
def feed(param):
    """Fit a character-level TF-IDF on stored tweets and return a mapping.

    All documents from ``rawtweets`` are loaded, each tweet text is printed,
    reduced to ASCII and collected. A character TF-IDF model is fitted and
    feature names are zipped with the matrix' stored values.

    Args:
        param: Not used.

    Returns:
        dict mapping feature name -> value taken from ``idf.data``.
    """
    # Round-trip through JSON to get plain Python structures.
    tweetdata = json_util.loads(json_util.dumps(rawtweets.find()))
    path = os.path.dirname(os.path.realpath(__file__))

    texts = []
    for tweetlist in tweetdata:
        tweet = tweetlist["text"]
        print(tweet)
        # Decompose accents, then drop everything that is not ASCII.
        decomposed = unicodedata.normalize('NFKD', tweet)
        texts.append(decomposed.encode('ascii', 'ignore').decode('utf-8'))

    vectorizer = TfidfVectorizer(analyzer='char')
    print(len(texts))
    vectorizer.build_analyzer()  # result is discarded
    idf = vectorizer.fit_transform(texts)
    feature_names = np.asarray(vectorizer.get_feature_names())
    print("len ", (feature_names))

    # NOTE(review): idf.data holds the matrix' stored non-zero entries in
    # storage order, so this pairs the k-th feature name with the k-th stored
    # entry rather than with a per-feature value - confirm this is intended.
    return dict(zip(feature_names, idf.data))
def create_analyser(data, col, type_ngrams='words'):
    """Build a case-preserving word n-gram analyzer for one data column.

    Args:
        data: Mapping/frame; ``data[col]`` supplies the documents the
            vectorizer is fitted on.
        col: Key of the text column in ``data``.
        type_ngrams: One of ``'words'`` (unigrams), ``'N_grams'`` (1- to
            3-grams) or ``'Only_N_grams'`` (2- to 3-grams).

    Returns:
        The analyzer callable of the fitted ``TfidfVectorizer``.

    Raises:
        ValueError: If ``type_ngrams`` is not one of the supported names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    ngram_ranges = {
        'words': (1, 1),
        'N_grams': (1, 3),
        'Only_N_grams': (2, 3),
    }
    if type_ngrams not in ngram_ranges:
        raise ValueError(
            "Unknown type_ngrams {!r}; expected one of {}".format(
                type_ngrams, sorted(ngram_ranges)))

    vectorizer = TfidfVectorizer(ngram_range=ngram_ranges[type_ngrams],
                                 lowercase=False,
                                 stop_words=None)
    # Fitting is kept so that invalid/empty input still fails here.
    vectorizer.fit(list(data[col]))
    return vectorizer.build_analyzer()
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):
    """Extract candidate phrases of *doc* and a feature vector for each.

    Args:
        doc: Raw document text.
        phrase_list: Known phrases, parallel to ``idf_vec``.
        idf_vec: idf value for each entry of ``phrase_list``.
        training_size: Training-corpus size; ``log10(training_size)`` is the
            idf used for phrases missing from ``phrase_list``.

    Returns:
        ``(phrases, features)``: the unique candidate phrases and, in the
        same order, the result of ``get_feature_vector`` for each.
    """
    idf_dic = {phrase: idf_vec[i] for i, phrase in enumerate(phrase_list)}

    print("--extracting NP")
    noun_phrases = {lemmatize(phrase)
                    for phrase in extract_candidate_chunks(doc)}

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list({phrase for phrase in analyzer(doc)
                    if valid_ngram(phrase, noun_phrases)})

    doc = preprocess(doc)
    doc_len = len(doc)
    entropy = get_entropy_doc(doc, phrases)

    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        # float() forces true division; under Python 2 the integer division
        # collapsed every position to 0 (or -1 when the phrase is absent).
        first_occurrence = float(doc.find(phrase)) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        features.append(
            get_feature_vector(phrase, tfidf, first_occurrence, entropy[i]))
    return phrases, features
def tfidf_vectorize(train_words, test_words):
    """Vectorize train and test documents with a shared TF-IDF model.

    Method 2: ``TfidfVectorizer`` with sublinear term frequency.

    Args:
        train_words: Iterable of training documents.
        test_words: Iterable of test documents.

    Returns:
        ``(tfidf_train_2, tfidf_test_2)`` sparse matrices over the vocabulary
        learned from ``train_words``.
    """
    print(
        '*************************\nTfidfVectorizer\n*************************'
    )
    from sklearn.feature_extraction.text import TfidfVectorizer

    tv = TfidfVectorizer(sublinear_tf=True)  # , max_df = 0.5
    tfidf_train_2 = tv.fit_transform(train_words)  # training matrix
    # Transform the test set with the fitted vectorizer. The earlier second
    # vectorizer re-fitted idf on the test data and did not use sublinear_tf,
    # so the two matrices were weighted differently.
    tfidf_test_2 = tv.transform(test_words)
    print("the shape of train is " + repr(tfidf_train_2.shape))
    print("the shape of test is " + repr(tfidf_test_2.shape))
    return tfidf_train_2, tfidf_test_2
def find_tfidf(self):
    """Pre-calculate the TF-IDF matrix of the description column.

    Filters ``self.data[self.description]`` with ``self.filter_text`` and
    stores the matrix on ``self.tfidf``, the feature names on
    ``self.tfidf_indices`` and the analyzer on ``self.tokenizer``.
    """
    print('Finding tfidf...')
    # set.update() mutates in place and returns None, so the previous
    # ``set(...).update(...)`` expression left stop_words as None and no stop
    # words were removed. Build the union explicitly; scikit-learn expects a
    # list for a custom stop-word collection.
    stop_words = sorted(
        set(stopwords.words('english')) | set(stopwords.words('italian')))
    vectorizer = TfidfVectorizer(lowercase=True,
                                 ngram_range=self.ngrams,
                                 norm='l2',
                                 smooth_idf=True,
                                 stop_words=stop_words,
                                 min_df=2,
                                 max_df=0.8)
    self.data[self.description] = self.data[self.description].apply(
        self.filter_text)
    self.tfidf = vectorizer.fit_transform(self.data[self.description])
    self.tfidf_indices = vectorizer.get_feature_names()
    self.tokenizer = vectorizer.build_analyzer()
def create_char_vectorizer(sentences):
    """Fit a character n-gram TF-IDF model and print diagnostics.

    Args:
        sentences: Iterable of text documents; used for fitting and then
            transformed.

    Returns:
        ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the sparse
        matrix of ``sentences``.
    """
    # Character 1- to 3-grams taken inside word boundaries.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                 analyzer='char_wb',
                                 use_idf=True,
                                 min_df=0.05,
                                 max_df=0.90,
                                 max_features=200000).fit(sentences)
    matrix = vectorizer.transform(sentences)

    # Diagnostics: sparse matrix, dense matrix, then the feature names.
    print(matrix)
    print(matrix.todense())
    print(vectorizer.get_feature_names())

    # Show what the analyzer yields for a sample sentence.
    sample_analyzer = vectorizer.build_analyzer()
    print(sample_analyzer("To Sherlock Holmes she is always _the_ woman."))
    return (vectorizer, matrix)
def generate_sentences():
    """Tokenize clause headers and sentences of all sourced sections.

    Clauses whose ``cleaned`` text is missing or contains 'deleted'
    (case-insensitive) are skipped. For each remaining clause the tokenized
    header is added first, followed by one token list per sentence.

    Returns:
        list of token lists.
    """
    print('Generating Clause Set')
    # Tokens are runs of two or more ASCII letters.
    analyser = TfidfVectorizer(token_pattern=r'(?u)\b[a-zA-Z]{2,}\b',
                               max_df=1).build_analyzer()
    all_sections = session.query(Section).filter(
        Section.source_id.isnot(None))

    docs = []
    for section in all_sections:
        for clause in section.clauses:
            cleaned = clause.cleaned
            if cleaned is None or 'deleted' in cleaned.lower():
                continue
            docs.append(analyser(clause.header))
            # Split after '.' or '?' followed by whitespace; the look-behinds
            # exclude some abbreviation-like patterns.
            for sent in re.split(
                    r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', cleaned):
                docs.append(analyser(sent))
    return docs
def old_vectorizer_glove(list_documents, labels, D, V, u_plus_v=False):
    """Represent each document by the mean of its word vectors.

    Args:
        list_documents: Iterable of raw documents.
        labels: Iterable of labels, parallel to ``list_documents``.
        D: Mapping word -> row index into the vector tables.
        V: Mapping with key 'u' (and 'v' when ``u_plus_v``) to vector tables.
        u_plus_v: When true, each word vector is the mean of its 'u' and 'v'
            vectors; otherwise the 'u' vector is used.

    Returns:
        ``(X, Y)``: ``X`` is a 2-D array with one row per document that had
        at least one known word (``None`` if there was no such document) and
        ``Y`` is the array of the matching labels.
    """
    # see https://scikit-learn.org/stable/modules/feature_extraction.html
    analyze = TfidfVectorizer().build_analyzer()

    rows = []
    Y = []
    for processed, (d, y) in enumerate(zip(list_documents, labels), 1):
        document = []
        for w in analyze(d):
            try:
                if u_plus_v:
                    glove = (V['u'][D[w]] + V['v'][D[w]]) / 2
                else:
                    glove = V['u'][D[w]]
            except KeyError:
                # Unknown word: skip it.
                continue
            document.append(glove)

        if document:
            rows.append(np.mean(document, axis=0))
            Y.append(y)

        # Progress report every 1000 documents.
        if processed % 1000 == 0:
            print(processed)

    # Stack once at the end: calling np.vstack inside the loop copied the
    # whole matrix per document (quadratic). This also makes a single kept
    # document yield shape (1, dim) instead of a bare 1-D vector.
    X = np.vstack(rows) if rows else None
    return X, np.array(Y)
def train(self, segments, ignore_before=4, ignore_after=4):
    """Score the words of every segment using 20newsgroups for idf.

    A corpus-level TF-IDF model is fitted on the stripped 20newsgroups
    training posts plus the concatenated segments; a second model, limited
    to the same vocabulary, is fitted on the segments. The element-wise
    product of the whole-text scores and the per-segment scores gives the
    final word scores.

    Sets ``self.word_scores`` (one dict word -> score per segment) and
    ``self.analyze`` (analyzer of the corpus vectorizer).

    Parameters:
        :segments: list of strings where each string is a segment
        :ignore_before: number of leading lines dropped from each post
        :ignore_after: number of trailing lines dropped from each post
    """
    data = fetch_20newsgroups(subset='train').data
    stripped_data = []
    for d in data:
        lines = d.split('\n')
        if len(lines) > ignore_before + ignore_after:
            # Explicit end index: ``lines[a:-0]`` would be empty when
            # ignore_after is 0.
            end = len(lines) - ignore_after
            stripped_data.append('\n'.join(lines[ignore_before:end]))
    txt = ''.join(segments)
    stripped_data.append(txt)

    # Train corpus tf-idf
    tfidf_corpus = TfidfVectorizer(stop_words='english')
    tfidf_corpus.fit(stripped_data)
    book_scores = tfidf_corpus.transform([txt])
    idx_to_word = tfidf_corpus.get_feature_names()
    print('Learned {} features CORPUS'.format(len(idx_to_word)))

    # Train document segment-wise tf-idf
    tfidf_book = TfidfVectorizer(vocabulary=tfidf_corpus.vocabulary_)
    segment_scores = tfidf_book.fit_transform(segments)
    print('Learned {} features BOOK'.format(len(tfidf_book.vocabulary_)))

    # Now get word scores in each segment. CSR guarantees row iteration and
    # the ``indices`` attribute used below.
    final_scores = book_scores.multiply(segment_scores).tocsr()
    word_scores = []
    for row in final_scores:
        word_scores.append({idx_to_word[j]: row[0, j] for j in row.indices})

    self.word_scores = word_scores
    self.analyze = tfidf_corpus.build_analyzer()
def keyword_extractor_tfidf(corpus_list, is_stop_words_allowed, n_gram_min,
                            n_gram_max):
    """Rank the n-grams of the first document by TF-IDF weight.

    Args:
        corpus_list: List of documents, each a list of word strings.
        is_stop_words_allowed: When equal to ``False`` English stop words are
            removed; any other value keeps them.
        n_gram_min: Smallest n-gram size.
        n_gram_max: Largest n-gram size.

    Returns:
        ``OrderedDict`` feature -> weight in the first document, sorted by
        ascending weight. Features absent from the first document have
        weight 0.0.

    Raises:
        ValueError: If ``n_gram_min > n_gram_max``, or (from scikit-learn)
            if no vocabulary can be built, e.g. for an empty corpus.
    """
    if n_gram_min > n_gram_max:
        raise ValueError('Invalid input n_gram_min should be <= n_gram_max')

    corpus = [' '.join(doc) for doc in corpus_list]

    # ``== False`` is kept on purpose: None and other falsy values must still
    # count as "stop words allowed", as before.
    stop_words = 'english' if is_stop_words_allowed == False else None  # noqa: E712
    vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max),
                                 stop_words=stop_words)

    features = vectorizer.fit_transform(corpus)
    # Only the first row is needed, so densify just that row.
    first_row = features[0].toarray().ravel().tolist()
    features_dictionary = dict(zip(vectorizer.get_feature_names(), first_row))
    return OrderedDict(sorted(features_dictionary.items(), key=itemgetter(1)))
def tfidf(text):
    """Return terms and weights above a small threshold, sorted by weight.

    Args:
        text: Iterable of raw documents.

    Returns:
        ``(l1, l2)``: terms and their weights in ascending weight order. When
        a term is weighted in several documents, the value from the last
        such document is kept.
    """
    vectorizer = TfidfVectorizer()
    transformer = TfidfTransformer()
    countVector = vectorizer.fit_transform(text)
    # NOTE(review): ``countVector`` already holds TF-IDF values, so the
    # transformer re-weights them - confirm CountVectorizer was not intended.
    weights = transformer.fit_transform(countVector).tocoo()

    # Matrix columns are vocabulary indices. The previous code indexed the
    # matrix with the token's position inside the document, which paired
    # tokens with unrelated columns and could run past the last column.
    index_to_term = {
        index: term for term, index in vectorizer.vocabulary_.items()
    }

    threshold = 0.0001
    key_dict = {}
    for col, value in zip(weights.col, weights.data):
        if value > threshold:
            key_dict[index_to_term[col]] = value

    ranked = sorted(key_dict.items(), key=lambda item: item[1])
    l1 = [term for term, _ in ranked]
    l2 = [value for _, value in ranked]
    return l1, l2
def fit(self, templates):
    """Learn (or reuse) the analyser and vocabulary, then update ``maxlen``.

    When ``self.vocabulary`` and ``self.analyser`` are both already set they
    are reused. Otherwise a vectorizer is fitted on ``templates`` and both
    objects are saved into ``self.feature_extraction_folder``.

    Args:
        templates: Documents to fit on and to feature-engineer.
    """
    already_fitted = self.vocabulary and self.analyser
    if not already_fitted:
        n_gram_bounds = (self.config_dict["min_n_gram"],
                         self.config_dict["max_n_gram"])
        vectorizer = TfidfVectorizer(ngram_range=n_gram_bounds,
                                     lowercase=True,
                                     stop_words=None,
                                     min_df=1)
        vectorizer.fit(templates)
        self.analyser = vectorizer.build_analyzer()
        self.vocabulary = vectorizer.vocabulary_

        # Persist both artefacts next to each other.
        folder = self.feature_extraction_folder
        save_object(os.path.join(folder, "analyzer.pickle"), self.analyser)
        save_object(os.path.join(folder, "vocabulary.pickle"),
                    self.vocabulary)

    inputs = self.feature_engineering(templates)
    longest_input = max(map(len, inputs))
    self.maxlen = max(longest_input, self.maxlen)
def preprocessAll(filename, percent):
    """Preprocess every line of a file and return the leading fraction.

    Args:
        filename: Path of the text file, read line by line.
        percent: Fraction of the preprocessed lines to keep, counted from
            the start of the file.

    Returns:
        list with the first ``int(n_lines * percent)`` preprocessed lines.
    """
    analyzer = TfidfVectorizer(strip_accents='unicode',
                               stop_words=None).build_analyzer()
    all_text = []
    with open(filename, 'r') as f:
        for line_cnt, line in enumerate(f):
            if line_cnt % 10000 == 0:
                # Blank the status line, then rewrite it in place.
                sys.stdout.flush()
                sys.stdout.write(" " * 25 + '\r')
                sys.stdout.flush()
                sys.stdout.write(str(line_cnt) + " lines processed.\r")
            all_text.append(preprocess(line, analyzer))

    keep = int(len(all_text) * percent)
    return all_text[0:keep]
def ida(articles): stopwords = [] doc_terms = [] with open('ch_stopwords.txt', 'r') as f: stopwords = set(f.read().lower().split('\n')) #print('stopwords', stopwords[:10]) trigram_vectorizer = TfidfVectorizer( ngram_range=(2, 3), token_pattern=r'([\u4e00-\u9fa5]{1}|)', min_df=10, max_df=20, stop_words=stopwords, analyzer='word') analyzer = trigram_vectorizer.build_analyzer() ''' for article in articles: terms = map(lambda x: x.replace(' ', ''), analyzer(article.Content)) #get rid of spaces #terms = set(map(lambda x: x.replace(' ', ''), analyzer(article.Content))) #get rid of spaces #terms = list(terms-stopwords) doc_terms.append(list(terms)) ''' article_contents = map(lambda x: x.Content, articles) doc_terms = trigram_vectorizer.fit_transform(article_contents) tf_feature_names = trigram_vectorizer.get_feature_names() print(len(tf_feature_names), tf_feature_names[100:200]) lda = LatentDirichletAllocation(n_topics=8, max_iter=200, evaluate_every=10, n_jobs=-1, verbose=1, learning_method='online') lda.fit(doc_terms) joblib.dump(lda, 'lda-n8-2.pkl', compress=1) print_top_words(lda, tf_feature_names, 10) #print(doc_terms.get_feature_names()) '''
def sentence_tokenizer(dataset_name="pascal"):
    """
    Fit a tf-idf vectorizer on the sentences of a dataset.

    Parameters
    ----------
    dataset_name : string
        'memorability' or 'pascal' or 'clipart'

    Returns
    -------
    analyze : object
        breaks sentences into words using scikit-learn tokenizer
    vectorizer : object of class TfidfVectorizer
        see scikit-learn documentation

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported names (previously
        this surfaced as an ``UnboundLocalError``).
    """
    # dataset name -> (.mat file, variable holding the sentences)
    datasets = {
        "memorability":
        ("../../data/sentences/memorability_888_img_5_sent.mat",
         "memorability_sentences"),
        "pascal": ("../../data/sentences/pascal_1000_img_50_sent.mat",
                   "pascal_sentences"),
        "clipart": ("../../data/sentences/clipart_500_img_48_sent.mat",
                    "clipart_sentences"),
    }
    if dataset_name not in datasets:
        raise ValueError("Unknown dataset_name {!r}; expected one of {}".format(
            dataset_name, sorted(datasets)))

    mat_path, variable = datasets[dataset_name]
    sentences = scipy.io.loadmat(mat_path)[variable]

    # Build corpus: one document per sentence group.
    corpus = [
        " ".join([sent[0] for sent in sent_group]) for sent_group in sentences
    ]

    # Build tf-idf vectorizer; at least three letters in a word.
    vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b")
    vectorizer.fit(corpus)
    analyze = vectorizer.build_analyzer()
    return analyze, vectorizer
def preprocess(raw_docs,
               stopwords,
               min_df=3,
               min_term_length=2,
               ngram_range=(1, 1),
               apply_tfidf=True,
               apply_norm=True,
               tokenizer=custom_tokenizer):
    """Vectorize a list of text documents stored as strings.

    Builds the vector space model, optionally applies idf weighting and
    optionally normalizes rows to unit length, all in one vectorizer.

    Args:
        raw_docs: Re-iterable collection of document strings.
        stopwords: Stop-word setting passed to the vectorizer.
        min_df: Minimum document frequency.
        min_term_length: Not used in this function.
        ngram_range: ``(min_n, max_n)`` n-gram range.
        apply_tfidf: Whether idf weighting is used.
        apply_norm: Whether rows are L2-normalized.
        tokenizer: Tokenizer callable.

    Returns:
        ``(X, terms, tfidf, docs)``: document-term matrix, terms ordered by
        column index, the fitted vectorizer, and the analysed token list of
        every document.
    """
    norm_function = "l2" if apply_norm else None
    tfidf = TfidfVectorizer(stop_words=stopwords,
                            lowercase=True,
                            strip_accents="unicode",
                            tokenizer=tokenizer,
                            use_idf=apply_tfidf,
                            norm=norm_function,
                            min_df=min_df,
                            ngram_range=ngram_range)
    X = tfidf.fit_transform(raw_docs)

    analyze = tfidf.build_analyzer()
    docs = list(map(analyze, raw_docs))

    # Invert the vocabulary map: terms[column index] = term.
    vocabulary = tfidf.vocabulary_
    terms = [""] * len(vocabulary)
    for term, column in vocabulary.items():
        terms[column] = term
    return (X, terms, tfidf, docs)
def create_vectorizer(sentences):
    """Fit a stemmed 1- to 3-gram TF-IDF model and print diagnostics.

    The stop-word list read from ``stopwords_file_path`` is extended with
    the first stemmed token of each stop word.

    Args:
        sentences: Iterable of text documents; used for fitting and then
            transformed.

    Returns:
        ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the sparse
        matrix of ``sentences``.
    """
    raw_stopwords = read_in_csv(stopwords_file_path)
    stopword_list = raw_stopwords + [
        tokenize_and_stem(word)[0] for word in raw_stopwords
    ]

    vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem,
                                 stop_words=stopword_list,
                                 ngram_range=(1, 3),
                                 use_idf=True,
                                 min_df=0.05,
                                 max_df=0.90,
                                 max_features=200000).fit(sentences)
    matrix = vectorizer.transform(sentences)

    # Diagnostics: sparse matrix, dense matrix, then the feature names.
    print(matrix)
    print(matrix.todense())
    print(vectorizer.get_feature_names())

    # Show what the analyzer yields for a sample sentence.
    sample_analyzer = vectorizer.build_analyzer()
    print(sample_analyzer("To Sherlock Holmes she is always _the_ woman."))
    return (vectorizer, matrix)
def __init__(self,
             data,
             n_features=10,
             preprocess=False,
             jobs=1,
             verbose=True):
    """Vectorize *data* with TF-IDF and store the result.

    Args:
        data: Iterable of raw documents.
        n_features: Maximum number of vocabulary terms.
        preprocess: When true, every token is passed through
            ``InputPreprocessor.normalise`` after the standard analysis.
        jobs: Stored on ``self._jobs``.
        verbose: Stored on ``self._verbose``.
    """
    self._clusters = None
    self._labels = []
    self._data = data
    self._verbose = verbose
    self._n_features = n_features

    vectorizer_options = dict(max_df=0.5,
                              max_features=n_features,
                              min_df=2,
                              stop_words='english',
                              use_idf=True)
    if preprocess:
        # build_analyzer is an instance method; calling it on the class (as
        # before) raised TypeError. A callable analyzer bypasses the
        # vectorizer's own stop-word handling, so the base analyzer is built
        # with the same stop words to keep both branches consistent.
        base_analyzer = TfidfVectorizer(stop_words='english').build_analyzer()
        ipp = InputPreprocessor(None)

        # Named so it does not shadow the ``preprocess`` argument.
        def normalising_analyzer(doc):
            return [ipp.normalise(word) for word in base_analyzer(doc)]

        vectorizer_options['analyzer'] = normalising_analyzer

    vectorizer = TfidfVectorizer(**vectorizer_options)
    self._preprocessed_data = vectorizer.fit_transform(self._data)
    self._jobs = jobs
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size = 450):
    """Extract candidate phrases of *doc* and a feature vector for each.

    Args:
        doc: Raw document text.
        phrase_list: Known phrases, parallel to ``idf_vec``.
        idf_vec: idf value for each entry of ``phrase_list``.
        training_size: Training-corpus size; ``log10(training_size)`` is the
            idf used for phrases missing from ``phrase_list``.

    Returns:
        ``(phrases, features)``: the unique candidate phrases and, in the
        same order, the result of ``get_feature_vector`` for each.
    """
    idf_dic = {phrase: idf_vec[i] for i, phrase in enumerate(phrase_list)}

    print("--extracting NP")
    noun_phrases = {lemmatize(phrase)
                    for phrase in extract_candidate_chunks(doc)}

    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 preprocessor=preprocess,
                                 ngram_range=(1, 3),
                                 tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list({phrase for phrase in analyzer(doc)
                    if valid_ngram(phrase, noun_phrases)})

    doc = preprocess(doc)
    doc_len = len(doc)
    entropy = get_entropy_doc(doc, phrases)

    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        # float() forces true division; under Python 2 the integer division
        # collapsed every position to 0 (or -1 when the phrase is absent).
        first_occurrence = float(doc.find(phrase)) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        features.append(
            get_feature_vector(phrase, tfidf, first_occurrence, entropy[i]))
    return phrases, features
sentences = scipy.io.loadmat('../../data/sentences/memorability_888_img_5_sent.mat') sentences = sentences['memorability_sentences'] f = open('../../automated_specificity.txt', 'w') sent_pairs, scores_w = list(), list() vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b') corpus = list() # Build corpus for sent_group in sentences: corpus.append(' '.join([sent[0] for sent in sent_group])) vectorizer.fit(corpus) analyze = vectorizer.build_analyzer() specificity_max, specificity_w = list(), list() for im_idx, sentence_group in enumerate(sentences): similarity_max, similarity_w = list(), list() for (sent1, sent2) in combinations(sentence_group, 2): words1, words2 = analyze(sent1[0]), analyze(sent2[0]) sent1_weights = [vectorizer.transform(sent1).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words1] sent2_weights = [vectorizer.transform(sent2).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words2] print >> f, [w.encode('utf-8') for w in words1] print >> f, [PrettyFloat(w) for w in sent1_weights] print >> f, [w.encode('utf-8') for w in words2]
data_full.append(SiteData('fb/srsplit/fullfbsearch_results_combined{i:02d}'.format(i=file_counter),categories, full_candidate_dict)) """ data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42) """ print 'data loaded' import conversions as conv from ersatzpg.utffile import utffile special_terms = [] vocabulary = [] basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False, stop_words='english') basic_analyze = basic_vectorizer.build_analyzer() with utffile('searchterms.csv') as f: for s in f: if s.startswith('<'): special_terms.append(s.strip('<>')) else: vocabulary.append(s.decode('utf-8').strip()) fb_page_data = {} with open('fb/facebookpolsurls_bkp.csv') as f: csvr = csv.DictReader(f) for l in csvr: fb_page_data.update({l['url']:{'fans':l['Fan Count'].replace(',',''),'authentic':l['Authentic Category']}}) def analyze(s): d=eval(s) special_keys = []
def build_analyzer(self):
    """Return an analyzer that stems every token of the stock analyzer.

    Returns:
        A callable taking a document and returning a generator of tokens
        stemmed with the English Snowball stemmer.
    """
    base_analyzer = TfidfVectorizer.build_analyzer(self)
    english_stemmer = SnowballStemmer('english')

    def stemmed_tokens(doc):
        # Lazy on purpose: callers get a generator, not a list.
        return (english_stemmer.stem(token) for token in base_analyzer(doc))

    return stemmed_tokens
def __init__(self, n_features, voc_file):
    """Store the configuration, load word clusters and build a tokenizer.

    Args:
        n_features: Stored unchanged on ``self.n_features``.
        voc_file: Passed to ``self.read_word_cluster``; also stored on
            ``self.voc_file``.
    """
    self.n_features = n_features
    self.voc_file = voc_file

    # read_word_cluster yields a pair: (word_clusters, grouped_words).
    cluster_info = self.read_word_cluster(voc_file)
    self.word_clusters, self.grouped_words = cluster_info

    # Only the analyzer callable is kept; the vectorizer is never fitted.
    self.vectorize = TfidfVectorizer(
        encoding='iso-8859-1', stop_words='english').build_analyzer()
class WeightedEmbeddingSearch:
    """Title search over tf-idf weighted word embeddings, diversified by k-means.

    Every title is represented by the tf-idf weighted sum of the spaCy
    vectors of its terms, normalised to unit length. A query is embedded the
    same way, titles are ranked by dot product with the query, the best hits
    are clustered, and one representative per cluster is returned.
    """

    def __init__(self):
        """Load the title CSV, fit the tf-idf model and embed every title."""
        print("Loading data csv")
        til_title_data = pd.read_csv(TIL_TITLE_CSV).dropna(
            subset=REQUIRED_COLUMNS)
        # Further title frames can be appended to this list; only the TIL
        # data set is currently enabled.
        self.title_data = pd.concat(
            [til_title_data], join='inner').reset_index(drop=True)

        print("Computing tf-idf matrix")
        self.vectorizer = TfidfVectorizer(stop_words='english',
                                          dtype=np.float32)
        tfidf_matrix = self.vectorizer.fit_transform(self.title_data["title"])

        print("Loading spacy")
        self.nlp = spacy.load('en_core_web_lg')

        print("Computing weighted embeddings")
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement when available and fall back on older releases.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.vectorizer.get_feature_names
        features = get_names()
        self.f_vectors = np.array([self.nlp.vocab[f].vector for f in features])
        weighted_embeddings = tfidf_matrix.dot(self.f_vectors)
        assert weighted_embeddings.shape == (len(self.title_data.index), 300)
        # EPS keeps all-zero rows from dividing by zero.
        self.n_weighted_embeddings = weighted_embeddings / (
            np.linalg.norm(weighted_embeddings, axis=1)[:, np.newaxis] + EPS)
        print("Done loading {} rows".format(len(self.title_data.index)))

    def search(self, query, method='similarity', top=10):
        """Return one result dict per cluster of the best-matching titles.

        Args:
            query: free-text query string.
            method: ``'similarity'`` or ``'score'``; selects how the results
                are ordered (see ``topResultsSorted``).
            top: accepted for interface compatibility; not used.

        Returns:
            A list of dicts with the keys ``type``, ``title``, ``subreddit``,
            ``permalink`` and ``score``. The list is empty when the query
            yields no tokens or no non-zero embedding.
        """
        query_tfidf = self.vectorizer.transform([query])
        if query_tfidf.count_nonzero() > 0:
            query_weighted = query_tfidf.dot(self.f_vectors).flatten()
        else:
            # No query term is in the tf-idf vocabulary: fall back to a plain
            # average of the word embeddings.
            tokens = self.vectorizer.build_analyzer()(query)
            if not tokens:
                # Query was all stop words, so tokenize manually.
                tokens = query.lower().split()
            if not tokens:
                # Empty / whitespace-only query: averaging zero vectors would
                # produce NaN and a meaningless ranking.
                return []
            query_weighted = np.average(
                [self.nlp.vocab[t].vector for t in tokens], axis=0).flatten()

        # If we have no embedding for the query, there is nothing to rank by.
        if np.count_nonzero(query_weighted) == 0:
            return []

        n_query_weighted = query_weighted / (np.linalg.norm(query_weighted) + EPS)
        rankings = self.n_weighted_embeddings.dot(n_query_weighted)
        rankings_index = np.argsort(-rankings)

        ranked_df = self.title_data.loc[rankings_index]
        ranked_titles = list(ranked_df['title'])
        ranked_scores = list(ranked_df['score'])
        top_ranked_em = self.n_weighted_embeddings[rankings_index]
        ranked_rankings = rankings[rankings_index]

        clusters = self.kMeans(ranked_titles, ranked_scores, ranked_rankings,
                               top_ranked_em, method)

        results = []
        # Each cluster contributes its first (best) member only.
        for _, members in clusters:
            row = ranked_df.iloc[members[0]]
            results.append({
                "type": "submission",
                "title": row["title"],
                "subreddit": row['subreddit'],
                "permalink": row['permalink'],
                "score": row['score'],
            })
        return results

    def kMeans(self, titles, scores, rankings, embeddings, method):
        """Cluster the top hits and return ``(label, members)`` pairs.

        ``rankings``, ``scores`` and ``embeddings`` must already be ordered
        best-first. Between 40 and 200 leading rows are clustered; ``titles``
        is accepted but not used.
        """
        outliers = int(np.sum(scipy.stats.zscore(rankings) > 3.5))
        top_hits = min(max(40, outliers), 200)
        candidates = embeddings[:top_hits]

        # KMeans cannot build more clusters than there are samples.
        n_clusters = min(20, len(candidates))
        if n_clusters == 0:
            return []

        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(candidates)
        counter = collections.Counter(kmeans.labels_)
        most_common = {label for label, _ in counter.most_common(10)}

        results = self.topSimOfEachCluster(kmeans.labels_, 10, most_common)
        self.topScoreOfEachCluster(results, 4, scores)
        return self.topResultsSorted(results, rankings, scores, method)

    # cluster number to top num based on similarity
    def topSimOfEachCluster(self, cluster_labels, num, most_common):
        """Map each label in ``most_common`` to its first ``num`` positions.

        Positions are indices into ``cluster_labels``; because callers pass
        labels in ranked order, the first positions are the most similar.
        """
        res = {}
        clusters_included = set(most_common)
        for position, label in enumerate(cluster_labels):
            if label not in clusters_included:
                continue
            if label not in res:
                res[label] = [position]
            elif len(res[label]) < num:
                res[label].append(position)
        return res

    # takes topOfEachCluster and gets the top num by score
    def topScoreOfEachCluster(self, sim_results, num, scores):
        """Trim every cluster, in place, to its ``num`` highest-scoring members."""
        for key in sim_results:
            sim_results[key].sort(key=lambda x: scores[x], reverse=True)
            sim_results[key] = sim_results[key][:num]

    # sort results by method
    def topResultsSorted(self, results, rankings, scores, method='similarity'):
        """Sort members within each cluster, then the clusters themselves.

        Args:
            results: dict mapping cluster label to a non-empty list of
                positions; the lists are sorted in place.
            rankings: similarity per position (used for ``'similarity'``).
            scores: score per position (used for ``'score'``).
            method: ``'similarity'`` or ``'score'``.

        Returns:
            List of ``(label, positions)`` tuples, best cluster first.

        Raises:
            ValueError: if ``method`` is not one of the two supported values.
        """
        if method == 'similarity':
            sort_values = rankings
        elif method == 'score':
            sort_values = scores
        else:
            raise ValueError(
                "method must be 'similarity' or 'score', got {!r}".format(method))

        for members in results.values():
            # sorts within a cluster
            members.sort(key=lambda x: sort_values[x], reverse=True)
        # sorts all clusters by their best member
        return sorted(results.items(),
                      key=lambda item: sort_values[item[1][0]],
                      reverse=True)
def main():
    """Train and evaluate a logistic-regression sentence classifier.

    Reads tab-separated ``label<TAB>sentence`` lines via ``fileinput``,
    vectorizes the sentences with tf-idf over POS-token bigrams, fits a
    logistic regression on the first 80% of the rows and reports the score,
    the most informative features and every misclassified test sentence.
    """
    if sys.version_info[0] == 2:
        # Python 2 only: make utf-8 the implicit str/unicode codec. Neither
        # call exists on Python 3, where utf-8 is already the default.
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf-8')

    pprint(LemmaTokenizer()("this is testing the stemming functionality"))

    # Materialise lists explicitly: map() is lazy on Python 3 and the code
    # below slices and indexes these sequences.
    fields = [line.split('\t') for line in fileinput.input()]
    sentences = [parts[1] for parts in fields]
    Y = [int(parts[0]) for parts in fields]

    vectorizer = TfidfVectorizer(min_df=1,
                                 tokenizer=POSTokenizer(),
                                 preprocessor=preprocess_sentence,
                                 ngram_range=(2, 2),
                                 stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # Chronological 80/20 split (no shuffling).
    num_samples = len(Y)
    num_train = int(num_samples * .8)
    print("Num training: %d" % num_train)
    X_train = X[0:num_train]
    Y_train = Y[0:num_train]
    X_test = X[num_train:]
    Y_test = Y[num_train:]

    # Show what each tokenization stage produces for a few sentences.
    analyze = vectorizer.build_analyzer()
    for sentence in sentences[0:10]:
        print(preprocess_sentence(sentence))
        print(analyze(sentence))
        print("LemmaTokenizer" + str(LemmaTokenizer()(sentence)))
        print(StemmingTokenizer()(sentence))

    logistic = linear_model.LogisticRegression(
        C=.5, class_weight=None, dual=False, fit_intercept=True,
        intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2',
        random_state=None, solver='liblinear', tol=0.0001, verbose=0)
    print(logistic.fit(X_train, Y_train).score(X_test, Y_test))
    show_most_informative_features(vectorizer, logistic, 25)

    # Column index -> feature name, for printing the features of a row.
    feature_index = {v: k for k, v in vectorizer.vocabulary_.items()}

    # Predict the whole test set at once rather than one row per call.
    y_pred = logistic.predict(X_test)
    num_errors = 0
    for i, (y_hat, y_true) in enumerate(zip(y_pred, Y_test)):
        if y_hat != y_true:
            num_errors += 1
            print("\n\nError predicting sentence: " + sentences[i + num_train])
            print(print_features(X_test[i], feature_index))
            print("Label: " + str(y_true))

    error_rate = float(num_errors) / len(Y_test)
    print("Accuracy : " + str(1 - error_rate))
def main():
    """Fit party classifiers on the tf-idf of bill texts and pickle them.

    Looks up the first sponsor of each bill (limited to 25 bills) to obtain
    a party label, fetches and HTML-strips the bill text, vectorizes the
    texts of Democratic/Republican bills as uni+bigram tf-idf and fits a
    LinearSVC, a linear SVC on a 60/40 split and L1/L2 logistic regressions.
    The LinearSVC and both logistic models are written to ``party_*.pkl``.

    Side effects: rebinds the module-level ``X`` to the tf-idf matrix.
    """
    global X
    logging.info('Started')

    # Pull the primary bill sponsor to match with party information.
    sponsors_query = db.bills_details.find(
        {},
        {'_id': 1, 'sponsors.leg_id': 1, 'sponsors.type': 1,
         'sponsors.name': 1, 'action_dates.signed': 1},
    ).limit(25)  # able to limit number of records for testing
    sponsors = list(sponsors_query)

    # One dict per bill: database id, legislator id, party, passed status.
    bill_party = []
    for bill in sponsors:
        primary = bill['sponsors'][0]
        leg_id = primary['leg_id']
        if leg_id is None:
            # No legislator id: use a placeholder id and the sponsor's name
            # in place of a party.
            leg_id = 'CA0000'
            party = primary['name']
        else:
            party = GetParty(leg_id)
            if party is None:
                party = primary['name']
        bill_party.append({
            'id': bill['_id'],
            'leg_id': leg_id,
            'party': party,
            'passed': bill['action_dates']['signed'] is not None,
        })
    logging.info('populated list of sponsor and party')

    bigram_vectorizer = TfidfVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b',
                                        min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    for entry in bill_party:
        leg_text = list(db.legtext.find({'_id': entry['id']},
                                        {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        entry['features'] = analyze(raw)
        entry['raw'] = raw

    # Filter once so that the feature rows and the targets cannot drift
    # out of alignment.
    party_options = {'democratic': 0, 'republican': 1}
    labelled = [entry for entry in bill_party
                if entry['party'].lower() in party_options]
    X = bigram_vectorizer.fit_transform([entry['raw'] for entry in labelled])
    print(bigram_vectorizer)
    logging.info('loaded tfidf vectorized bigrams')

    targets = np.array([party_options[entry['party'].lower()]
                        for entry in labelled])
    data = X.toarray()

    # =====================================================================
    # Train different models - Linear SVC, SVC, Logistic regression
    # =====================================================================
    # Every print below takes a single pre-formatted string so that the
    # output is the same on Python 2 and Python 3.
    logging.info('Linear Support Vector Classification')
    clf = LinearSVC(loss='l2')
    print(clf)
    clf = clf.fit(data, targets)
    print('LinearSVC Coef %s' % (clf.coef_,))
    print('LinearSVC Intercept %s' % (clf.intercept_,))
    print('LinearSVC Score/R2 %s' % (clf.score(data, targets),))
    with open('party_linearSVC.pkl', 'wb') as mclf:
        pickle.dump(clf, mclf)
    logging.info('output LinearSVC to party_linearSVC.pkl')

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, targets, test_size=0.4, random_state=0)
    clfCV = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print(clfCV)
    print('training shape %s %s' % (X_train.shape, y_train.shape))
    print('testing shape %s %s' % (X_test.shape, y_test.shape))
    print('Test Score %s' % (clfCV.score(X_test, y_test),))
    print('Train Score %s' % (clfCV.score(X_train, y_train),))

    # Logistic Regression
    logging.info('Logistic Regression')
    logreg_l1 = linear_model.LogisticRegression(C=1.0, penalty='l1')
    logreg_l2 = linear_model.LogisticRegression(C=1.0, penalty='l2')
    logreg_l1.fit(data, targets)
    logreg_l2.fit(data, targets)
    print(logreg_l1)
    print(logreg_l2)
    print('Pseudo-R2 penalty l1 %s' % (logreg_l1.score(data, targets),))
    print('Pseudo-R2 penalty l2 %s' % (logreg_l2.score(data, targets),))
    print('LogReg l1 Coef %s' % (logreg_l1.coef_,))
    print('LogReg l1 Intercept %s' % (logreg_l1.intercept_,))
    with open('party_logreg_l1.pkl', 'wb') as lr1:
        pickle.dump(logreg_l1, lr1)
    logging.info('output Logistic regression to party_logreg_l1.pkl')
    with open('party_logreg_l2.pkl', 'wb') as lr2:
        pickle.dump(logreg_l2, lr2)
    logging.info('output Logistic regression to party_logreg_l2.pkl')

    logging.info('Finished')
def build_analyzer(self):
    """Return an analyzer whose tokens are stemmed by the class-level stemmer.

    Wraps the callable from ``TfidfVectorizer.build_analyzer`` so that each
    token is passed through ``StemmedTfidfVectorizer.english_stemmer``; the
    wrapper returns a generator of stemmed tokens.
    """
    base_analyzer = TfidfVectorizer.build_analyzer(self)

    def stemmed_tokens(doc):
        # The stemmer is looked up on the class at call time, as before.
        return (StemmedTfidfVectorizer.english_stemmer.stem(token)
                for token in base_analyzer(doc))

    return stemmed_tokens
def main():
    """Fit tree-based party classifiers on the tf-idf of bill texts.

    Looks up the first sponsor of each bill to obtain a party label, fetches
    and HTML-strips the bill text, vectorizes the texts of
    Democratic/Republican bills as uni+bigram tf-idf, then fits a decision
    tree, a random forest, extra trees and AdaBoost on the shuffled,
    standardised features and prints each model's training score.

    Side effects: rebinds the module-level ``X`` (finally to the
    standardised dense feature matrix).
    """
    global X
    logging.info('Started')

    # Pull the primary bill sponsor to match with party information.
    sponsors_query = db.bills_details.find(
        {},
        {'_id': 1, 'sponsors.leg_id': 1, 'sponsors.type': 1,
         'sponsors.name': 1, 'action_dates.signed': 1},
    )  # able to limit number of records for testing
    sponsors = list(sponsors_query)

    # One dict per bill: database id, legislator id, party, passed status.
    bill_party = []
    for bill in sponsors:
        primary = bill['sponsors'][0]
        leg_id = primary['leg_id']
        if leg_id is None:
            # No legislator id: use a placeholder id and the sponsor's name
            # in place of a party.
            leg_id = 'CA0000'
            party = primary['name']
        else:
            party = GetParty(leg_id)
            if party is None:
                party = primary['name']
        bill_party.append({
            'id': bill['_id'],
            'leg_id': leg_id,
            'party': party,
            'passed': bill['action_dates']['signed'] is not None,
        })
    logging.info('populated list of sponsor and party')

    bigram_vectorizer = TfidfVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b',
                                        min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    for entry in bill_party:
        leg_text = list(db.legtext.find({'_id': entry['id']},
                                        {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        entry['features'] = analyze(raw)
        entry['raw'] = raw

    # Filter once so that the feature rows and the targets cannot drift
    # out of alignment.
    party_options = {'democratic': 0, 'republican': 1}
    labelled = [entry for entry in bill_party
                if entry['party'].lower() in party_options]
    X = bigram_vectorizer.fit_transform([entry['raw'] for entry in labelled])
    print(bigram_vectorizer)
    logging.info('loaded tfidf vectorized bigrams')

    targets = np.array([party_options[entry['party'].lower()]
                        for entry in labelled])
    data = X.toarray()

    # =====================================================================
    # Random Forests Modeling
    # =====================================================================
    # NOTE: the decision-surface plotting that used to follow each fit was
    # disabled and has been dropped together with its plot-only constants.
    n_estimators = 30
    RANDOM_SEED = 9  # re-seeded on every iteration

    models = [DecisionTreeClassifier(max_depth=None),
              RandomForestClassifier(n_estimators=n_estimators),
              ExtraTreesClassifier(n_estimators=n_estimators),
              AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=n_estimators)]

    for model in models:
        # We use all the features where the sklearn example chose specific ones.
        X = data
        y = targets

        # Shuffle. Seeding here also resets the global NumPy RNG before each
        # fit, so this stays inside the loop.
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize. Constant features have zero deviation; dividing by
        # it would yield NaN, so leave those columns at zero instead.
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        std[std == 0] = 1.0
        X = (X - mean) / std

        # Train. The model itself is fitted (not a clone) because
        # ``model.estimators_`` is inspected below.
        clf = model.fit(X, y)
        scores = clf.score(X, y)

        # Build a console title from the class name by slicing away the
        # module path and the trailing "Classifier".
        model_title = str(type(model)).split(".")[-1][:-2][:-len("Classifier")]
        model_details = model_title
        if hasattr(model, "estimators_"):
            model_details += " with {} estimators".format(len(model.estimators_))
        # Single pre-formatted string: same output on Python 2 and 3.
        print('%s with all features has a score of %s' % (model_details, scores))

    logging.info('Finished')
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
# NOTE(review): this re-fits the same transformer on the test counts, so the
# test IDF weights differ from the training ones -- confirm this is intended.
tfidf_test = tfidftransformer.fit(counts_test).transform(counts_test)
# Alternatively, let the two tf-idf models share one vocabulary.

# Method 2: TfidfVectorizer
print('*************************\nTfidfVectorizer\n*************************')
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# NOTE(review): `newsgroup_train` here vs `newsgroups_test` below -- verify
# both names exist earlier in the file.
tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
# tv2 reuses tv's vocabulary, so both matrices have the same columns.
tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
print("the shape of train is " + repr(tfidf_train_2.shape))
print("the shape of test is " + repr(tfidf_test_2.shape))
analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms


def calculate_result(actual, pred):
    """Print precision, recall and F1 of ``pred`` against ``actual``.

    F1 is 2 * (precision * recall) / (precision + recall).
    """
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print('predict info:')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred)))


# Or use sklearn's ready-made feature loader, fetch_20newsgroups_vectorized.
print('*************************\nfetch_20newsgroups_vectorized\n*************************')
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42) """ print 'data loaded' import conversions as conv from ersatzpg.utffile import utffile special_terms = [] vocabulary = [] basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False, stop_words='english') basic_analyze = basic_vectorizer.build_analyzer() with utffile('searchterms.csv') as f: for s in f: if s.startswith('<'): special_terms.append(s.strip('<>')) else: vocabulary.append(s.decode('utf-8').strip()) def analyze(s): d = eval(s) special_keys = [] name = d['name'] electoral_district_type = d['electoral_district_type'] electoral_district_name = d['electoral_district_name'] state = d['state']
def build_analyzer(self):
    """Return an analyzer whose tokens are stemmed by ``english_stemmer``.

    Wraps the callable from ``TfidfVectorizer.build_analyzer``; the wrapper
    returns a generator of stemmed tokens.
    """
    # Comment this method out and the class behaves like a plain
    # TfidfVectorizer.
    # Equivalent call: super(StemmedTfidfVectorizer, self).build_analyzer()
    base_analyzer = TfidfVectorizer.build_analyzer(self)

    def stemmed_tokens(doc):
        # english_stemmer is resolved from module scope at call time.
        return (english_stemmer.stem(token) for token in base_analyzer(doc))

    return stemmed_tokens
# NOTE(review): the test labels start at trainset_size + 1, which skips the
# label at index trainset_size -- confirm this matches how X_test is sliced.
y_test = np.array(list(nyt_labels[trainset_size + 1:]))

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

# `unicode` exists only on Python 2; fall back to `str` on Python 3.
try:
    text_type = unicode  # noqa: F821
except NameError:
    text_type = str
test_string = text_type(nyt_data[0])

# Show what each stage of the (still unfitted) vectorizer does to one text.
print("Example string: " + test_string)
print("Preprocessed string: " + vectorizer.build_preprocessor()(test_string))
print("Tokenized string:" + str(vectorizer.build_tokenizer()(test_string)))
print("N-gram data string:" + str(vectorizer.build_analyzer()(test_string)))

# Fit the vocabulary and IDF on the training texts only, then reuse them
# for the test texts.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
# In[40]:

from sklearn.metrics.pairwise import cosine_similarity

# A short example using the sentences above.
words_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=200000,
    min_df=0.2,
    stop_words='english',
    use_idf=True,
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
)

# Fit the vectorizer to the sentences, timed with the IPython %time magic.
get_ipython().magic(u'time words_matrix = words_vectorizer.fit_transform(sents) #fit the vectorizer to synopses')

# A shape of (2, 18) means the matrix has 2 rows (two sentences) and
# 18 columns (18 terms).
print(words_matrix.shape)
print(words_matrix)

# This is how we get the terms: run the analyzer on a few sample sentences.
analyze = words_vectorizer.build_analyzer()
for sample_sentence in (
    "Today (May 19, 2016) is his only daughter's wedding.",
    "Vito Corleone is the Godfather.",
    "Vito's youngest son, Michael, in a Marine Corps uniform, introduces his girlfriend, Kay Adams, to his family at the sprawling reception.",
):
    print(analyze(sample_sentence))

all_terms = words_vectorizer.get_feature_names()
print(all_terms)
print(len(all_terms))

# Sentences 1 and 2 have similarity 0; sentences 1 and 3 share "his";
# sentences 2 and 3 share "Vito". Try changing "Vito's" in sentence 3 to
# "His" and see how the similarity matrix changes.
example_similarity = cosine_similarity(words_matrix)
example_similarity

# Now onto the fun part. Using the tf-idf matrix, you can run a slew of
# clustering algorithms to better understand the hidden structure within the
# synopses. I first chose k-means. K-means initializes with a pre-determined
# number of clusters (I chose 5). Each observation is assigned to a cluster
# (cluster assignment) so as to minimize the within-cluster sum of squares.
# Next, the mean of the clustered observations is calculated and used as the
# new cluster centroid. Then, observations are reassigned to clusters and
# centroids recalculated in an iterative process until the algorithm reaches
# convergence.
#
# I found it took several runs for the algorithm to converge to a global
# optimum, as k-means is susceptible to reaching local optima. Open question:
# how do we decide that the algorithm has converged?