def file():
    cats = ['alt.atheism', 'sci.electronics']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize all documents and gather counts
    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    #f = open('test_all.txt', 'wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # tokenization result for this document
        #print tokens
        word_sort = np.argsort(-vectors[j].data)
        print 'vertex ' + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))
        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # tf-idf terms ordered by position
        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
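# A minimal sketch (not part of the snippet above) of what build_tokenizer() returns:
# a plain callable that applies the vectorizer's token_pattern. It does not lowercase,
# which is why the loop above compares tokens[line].lower() == word.
from sklearn.feature_extraction.text import TfidfVectorizer

tokenize = TfidfVectorizer().build_tokenizer()
print(tokenize("TF-IDF weighting, explained!"))
# ['TF', 'IDF', 'weighting', 'explained']  -- the default pattern keeps runs of 2+ word characters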
def tokenize_query(query, ds, vocab_to_ix, words_compressed, docs_compressed, ATN_word_to_ix):
    """
    Returns a dictionary with structure {term : frequency}.
    Also preprocesses the input query string using the Sklearn TfidfVectorizer.
    """
    print >> sys.stderr, "tokenize_query"
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[vocab_to_ix[tok]] += 1
            query_dict_term[tok] += 1
    print >> sys.stderr, "lending control to expand query"
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix,
                                       words_compressed, docs_compressed, ATN_word_to_ix)
    gc.collect()
    return expanded_query_dict
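# A stripped-down sketch of the preprocess -> tokenize -> count pattern the docstring
# above describes, without the project-specific vocab_to_ix lookup and query expansion.
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

helper = TfidfVectorizer(stop_words='english')
preprocess = helper.build_preprocessor()   # lowercasing / accent stripping
tokenize = helper.build_tokenizer()        # regex token splitting (does not remove stop words)

def term_frequencies(query):
    counts = defaultdict(int)
    for tok in tokenize(preprocess(query)):
        counts[tok] += 1
    return dict(counts)

print(term_frequencies("Reuters reuters oil prices"))  # {'reuters': 2, 'oil': 1, 'prices': 1}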
def file(): cats = ["alt.atheism", "sci.electronics"] newsgroups_train = fetch_20newsgroups(subset="train", categories=cats) newsgroups_test = fetch_20newsgroups(subset="test", categories=cats) vectorizer = TfidfVectorizer() # 把所有文档都切词,统计了 vectors_train = vectorizer.fit_transform(newsgroups_train.data) vectors = vectorizer.transform(newsgroups_test.data) print vectors.shape[1] # f=open('test_all.txt','wb') for j in range(0, vectors.shape[0]): item_id = list() tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j]) # 提取分词结果 # print tokens word_sort = np.argsort(-vectors[j].data) print "顶点" + str(j) for i in range(0, len(word_sort)): word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]] # 这个是tf-idf詞 for line in range(0, len(tokens)): if tokens[line].lower() == word: item_id.append((line, word_sort[i])) pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True) # 抽取tf-idf词 word_word = np.zeros([len(word_sort), len(word_sort)]) for p in range(0, len(pos_item)): if p < (len(pos_item) - 1): ki = word_sort[pos_item[p][1]] kj = word_sort[pos_item[p + 1][1]] word_word[ki, kj] = word_word[ki, kj] + 1
def tokenize_query(query, ds):
    """
    Returns a dictionary with structure {term : frequency}.
    Also preprocesses the input query string using the Sklearn TfidfVectorizer.
    """
    print >> sys.stderr, "tokenize_query"
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(BASE, ds, 'vocab_to_ix.json')) as f:
        vocab_to_ix = json.load(f)
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[vocab_to_ix[tok]] += 1
            query_dict_term[tok] += 1
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix)
    gc.collect()
    return expanded_query_dict
def vectorize_reu_iden(): helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16) tfidf_preprocessor = helper.build_preprocessor() tfidf_tokenizer = helper.build_tokenizer() news = pd.read_csv('data/reu_identifiers.csv', names=['date', 'id', 'title'], usecols=['id', 'title']) news = news[news['title'].isnull() == False] news = news[2283884:] #2016 on news.reindex(labels=np.arange(len(news))) gc.collect() article_tf = {} doc_freq = defaultdict(lambda: 0) unique_toks = set() for ix, story in news.iterrows(): tf_dict = defaultdict(lambda: 0) tokens = tfidf_tokenizer(story['title']) story_unique_toks = set(tokens) for tok in tokens: tf_dict[tok] += 1 for tok in story_unique_toks: unique_toks.add(tok) doc_freq[tok] += 1 article_tf[story['id']] = tf_dict gc.collect() return article_tf, doc_freq, unique_toks
def corpusweights(s): records = s.maC.getRecords() tfidf = [] topics = [] corpus = [] for record in records: if (record['domain'] == s.domain): topics.append(record['topic']) corpus.append(record['body']) if s.verbose: s.logger.info("corpusweights : topics : " + str(topics) + " len(corpus) : " + str(len(corpus))) tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english', lowercase=True, token_pattern='[A-Za-z]{2,}') tokenize = tf.build_tokenizer() tfidf_matrix = tf.fit_transform(corpus) feature_names = tf.get_feature_names() dense_tfidf_matrix = tfidf_matrix.todense() for i in range(0, len(dense_tfidf_matrix)): topic = dense_tfidf_matrix[i].tolist()[0] # filter out phrases with a score of 0 phrase_scores = [ pair for pair in zip(range(0, len(topic)), topic) if pair[1] > 0 ] sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1) # find the min and max score for normalization by grabbing the scores of # the first and last elements in the sorted list max_score = sorted_phrase_scores[0][1] min_score = sorted_phrase_scores[len(sorted_phrase_scores) - 1][1] tfidf.append(dict({'topic': topics[i], 'phrases': []})) for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores ]: # normalize scores to a 0 to 1 range normalized_score = (score - min_score) / (max_score - min_score) normalized_score = 1 - 100**( -1 * normalized_score ) #(score - min_score) / (max_score - min_score) tfidf[i]['phrases'].append( dict({ 'phrase': phrase, 'score': normalized_score })) return tfidf
def create_vocab(train):
    init_time = time.time()
    vocab = set()
    t = TfidfVectorizer()
    tokenizer = t.build_tokenizer()
    for ex in train[0]:
        vocab.update(tokenizer(ex))
    end_time = time.time()
    print("it took " + str(end_time - init_time) + " to create the vocabulary")
    return vocab
def tokenize(observations):
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        lowercase=True,
        analyzer='word',
    )
    tokenizer = vectorizer.build_tokenizer()
    observations['aft_comment'] = observations['aft_comment'].astype(str)
    tokenized_text = observations['aft_comment'].apply(tokenizer).values
    return tokenized_text
class TfidfTokenizerWrapper(AbstractTokenizer):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tokenizer = self.vectorizer.build_tokenizer()

    def tokenize(self, text):
        return self.tokenizer(text)

    def convert_tokens_to_ids(self, text_list):
        return self.vectorizer.fit_transform(
            (' '.join(tokenized_text) for tokenized_text in text_list)).toarray()
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1,
                                     binary=False,
                                     ngram_range=(1, 3),
                                     tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
class Vectorizer(object): def __init__(self): self.count_vec = TfidfVectorizer(binary = True, ngram_range = (1, 3), tokenizer = Tokenizer()) self.last_vec = CountVectorizer(binary = True, ngram_range = (1, 1), tokenizer = Tokenizer()) def collect_last_term(self, X): X_last = list() tokens = self.last_vec.build_tokenizer() _logger.debug("Extracting last term for each sentence") for sent in X: X_last.append(tokens(sent)[-1]) _logger.debug("Fitting last-term vectorizer") return X_last def fit(self, X, y = None): _logger.debug("Fitting count vectorizer") self.count_vec.fit(X) X_last = self.collect_last_term(X) self.last_vec.fit(X_last) return self def transform(self, X, y = None): #return self.count_vec.transform(X) _logger.debug("Doing tfidf transform") Xc = self.count_vec.transform(X) X_last = self.collect_last_term(X) _logger.debug("Doing last term transform") Xl = self.last_vec.transform(X_last) _logger.debug("stacking features") ret = sparse.hstack([Xc, Xl]) tokens = self.count_vec.build_tokenizer() l = list() for sent in X: terms = tokens(sent) l.append(1 if ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0) l = np.array(l) l.shape = len(l), 1 ret = sparse.hstack([ret, l]) _logger.debug("vectorization transform done") return ret
def find_tfidf(self):
    ''' pre-calculate tfidf '''
    print('Finding tfidf...')
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(lowercase=False,
                                 ngram_range=self.ngrams,
                                 norm='l2',
                                 smooth_idf=True,
                                 stop_words=stop_words,
                                 min_df=2,
                                 max_df=0.8)
    data = self.data[self.description].apply(self.remove_html)
    self.tfidf = vectorizer.fit_transform(data)
    self.tfidf_indices = vectorizer.get_feature_names()
    self.tokenizer = vectorizer.build_tokenizer()
def tfidf_sequential_model(data, only_overall=True, **kwargs):
    X = field_array(data, 'Text')
    X.append(' '.join(X))
    tfidfer = TfidfVectorizer(**kwargs)
    tfidfer.fit(X)
    values = tfidfer.transform(X)
    if only_overall:
        values = values[-1]
    terms = tfidfer.get_feature_names()
    n = len(terms)
    s = sorted(range(n), key=lambda k: values[0, k], reverse=True)
    for i in range(n):
        print('%s\t%s\t%s' % (terms[s[i]], values[0, s[i]], (i + 1) / n))
    return {
        'terms': terms,
        'tokenizer': tfidfer.build_tokenizer(),
        'values': values,
    }
def tokenize_query(query):
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(os.path.dirname(__file__), 'reuters/vocab_to_ix.json')) as f:
        #vocab_to_ix = json.load(open('vocab_to_ix.json'))
        vocab_to_ix = json.load(f)
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict[vocab_to_ix[tok]] += 1
    gc.collect()
    return query_dict
def transform_cnn_data(self, X_raw, feat_and_param): feat_and_param['feats']['ngram_range'] = (1,1) feat_and_param['feats']['use_idf'] = False feat_and_param['feats']['binary'] = False vectorizer = TfidfVectorizer(**feat_and_param['feats']) vectorizer.fit(X_raw) tokenizer = TfidfVectorizer.build_tokenizer(vectorizer) X_raw_tokenized = [tokenizer(ex) for ex in X_raw] train_X = [] for example in X_raw_tokenized: for i in range(len(example)): example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i]) train_X.append([vectorizer.transform(example)]) index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()} for key in index_to_word: index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key]) return train_X, index_to_word
def transform_cnn_data(self, X_raw, feat_and_param): #DEBUGGING feat_and_param['feats']['ngram_range'] = (1,1) feat_and_param['feats']['use_idf'] = False feat_and_param['feats']['binary'] = False vectorizer = TfidfVectorizer(**feat_and_param['feats']) vectorizer.fit(X_raw) tokenizer = TfidfVectorizer.build_tokenizer(vectorizer) X_raw_tokenized = [tokenizer(ex) for ex in X_raw] train_X = [] for example in X_raw_tokenized: for i in range(len(example)): example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i]) train_X.append([vectorizer.transform(example)]) index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()} #for key in index_to_word: # index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key]) return train_X, index_to_word
class CleanAndVectorize(object): def __init__(self, **kwargs): max_df = kwargs.get('max_df', .9) max_features = kwargs.get('max_features', 1000) self.vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True, analyzer='word', max_df=max_df, max_features=max_features) self.tokenizer = self.vectorizer.build_tokenizer() self.cols_to_extract = [ 'aft_id', 'aft_page', 'aft_page_revision', 'aft_user', 'aft_user_text', 'aft_comment', 'aft_noaction', 'aft_inappropriate', 'aft_helpful', 'aft_unhelpful', 'aft_rating' ] def process(self, observations, save_tokens=False, remove_zero=True, debug=False, add_rating=False): if debug: observations = observations.sample(debug) observations = observations[self.cols_to_extract] observations['aft_comment'] = observations['aft_comment'].astype(str) observations['aft_net_sign_helpful'] = np.sign( observations['aft_helpful'] - observations['aft_unhelpful']).astype(int) if remove_zero: observations = observations.loc[ observations['aft_net_sign_helpful'] != 0] if save_tokens: observations['tokenized_text'] = observations['aft_comment'].apply( self.tokenizer) #observations['feature_vector'] = self.vectorizer.fit_transform(observations['aft_comment'].values).toarray().tolist() feature_vectors = self.vectorizer.fit_transform( observations['aft_comment'].values) if add_rating: feature_vectors = hstack( (feature_vectors, observations['aft_rating'].values[:, None])) return observations, feature_vectors
def process_joke(joke):
    data = {}

    # Lowercase text.
    joke.text = joke.text.lower()

    # Replace text with dict.
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer()
    tokenizer = vectorizer.build_tokenizer()

    def tokenize_text(text, prefix=''):
        d = {}
        for term in tokenizer(text):
            if term in stop_words:
                continue
            d[prefix + term] = d.get(prefix + term, 0) + 1
        return d

    data.update(tokenize_text(joke.text, 't_'))
    data.update({('cat_' + cat): 1 for cat in joke.categories})
    data.update({('subcat_' + cat): 1 for cat in joke.subcategories})
    return data
def main(): vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1), token_pattern='\\b\\w+\\b') #, tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = 10 # max(10, args.fixk) args.fixk = None data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) ### SENTENCE TRANSFORMATION sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." to recognize as end of sentence data.train.data = experiment_utils.clean_html(data.train.data) data.test.data = experiment_utils.clean_html(data.test.data) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset ## create splits of data: pool, test, oracle, sentences expert_data = Bunch() train_test_data = Bunch() expert_data.sentence, train_test_data.pool = split_data(data.train) expert_data.oracle, train_test_data.test = split_data(data.test) data.train.data = train_test_data.pool.train.data data.train.target = train_test_data.pool.train.target data.test.data = train_test_data.test.train.data data.test.target = train_test_data.test.train.target ## convert document to matrix data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER: ORACLE print("Training Oracle expert") labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector) expert_data.oracle.train.data = sent_train expert_data.oracle.train.target = np.array(labels) expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data) exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target) #### EXPERT CLASSIFIER: SENTENCES print("Training sentence expert") labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector) expert_data.sentence.train.data = sent_train expert_data.sentence.train.target = np.array(labels) expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data) sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target) #### TESTING THE CLASSIFERS test_target, test_data = split_data_sentences(data.test,sent_detector) test_data_bow = vct.transform(test_data) #pred_sent = sent_clf.predict(test_data_bow) pred_ora = exp_clf.predict(test_data_bow) y_probas = sent_clf.predict_proba(test_data_bow) pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)] ## just based on one class probability # order = np.argsort(y_probas[:,0]) order = np.argsort(y_probas.max(axis=1)) print "ORACLE\tSENTENCE\tMAX-SENT" # for i in order[:500]: # print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i] for i in order[-500:]: print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i] print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent) print 
"Class distribution: %s" % pred_sent.sum() print "Size of data: %s" % pred_sent.shape[0] sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000] clf = linear_model.LogisticRegression(penalty='l1', C=1) bootstrap = rand.permutation(len(test_data)) x = [] y = [] for s in sizes: indices = bootstrap[:s] train_x = expert_data.sentence.train.bow[indices[:s]] train_y = expert_data.sentence.train.target[indices[:s]] clf.fit(train_x, train_y) predictions = clf.predict(test_data_bow) scores = metrics.accuracy_score(test_target,predictions) ## print clf.__class__.__name__ print "Accuracy {0}: {1}".format(s, scores) y.append(scores) plt.clf() plt.title("Accuracy") plt.xlabel("Labels") plt.ylabel("Accuracy") plt.legend() plt.plot(sizes, y, '--bo', label="sent") plt.show()
class FeatureExtractor: vectorizer = None feature_names = None feature_matrix = None def train_extractor_from_lines(self, train_lines, labels, test_lines): self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT) self.vectorizer.fit(train_lines + test_lines) pass def load_vectorizer(self): input_file = open('../models/tfidf_vectorizer.pkl', 'rb') self.vectorizer = pickle.load(input_file) input_file.close() pass def save_vectorizer(self): output_file = open('../models/tfidf_vectorizer.pkl', 'wb') pickle.dump(self.vectorizer, output_file) output_file.close() pass def train_extractor(self, full = False): if not full: train_lines = file2lines('../data/train_lite.csv') labels = file2labels('../data/train_lite.csv') test_lines = file2lines('../data/test_lite.csv') else: train_lines = file2lines('../data/train.csv') labels = file2labels('../data/train.csv') test_lines = file2lines('../data/test.csv') self.train_extractor_from_lines(train_lines, labels, test_lines) pass def lines2words(self, lines): self.tokenizer = self.vectorizer.build_tokenizer() return [self.tokenizer(line) for line in lines] def lines2features(self, lines, use_tense = False): """ returns DataFrame(feature_matrix, feature_name) ['word_rainny', 'word_'sunny'], array([ [1, 0.4, 0.2], [0.2, 1, 0.2], ]) """ self.feature_names = [] self.feature_matrix = None # tf-idf features data = self.vectorizer.transform(lines).toarray() self.feature_names = self.vectorizer.get_feature_names() self.feature_matrix = data # additional features add_features = [] important_words = ['sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain', 'snow', 'tornado', 'storm', 'hurricane'] important_words = ['cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind'] self.feature_names = self.feature_names + ['impt_words:' + word for word in important_words] if use_tense: self.feature_names = self.feature_names + ['past_tense_num', 'present_tense_num'] all_words = self.lines2words(lines) for words in all_words: # important words important_words_ftr = [int(word in words) for word in important_words] add_features.append(important_words_ftr) # tense if use_tense: tagz = zip(*nltk.pos_tag(nltk.word_tokenize(words)))[1] past_num = len([v for v in tagz if v == 'VBD']) present_num = len([v for v in tagz if v in ['VBP', 'VB']]) add_features.append([past_num, present_num]) self.feature_matrix = np.hstack((self.feature_matrix, add_features)) return DataFrame(self.feature_matrix, columns = self.feature_names)
count_pos_test = count_neg_test + 1 label_test = test_data[:,1] #vctr = CountVectorizer(stop_words='english',min_df = 1) #vctr2 = HashingVectorizer(stop_words='english') vctr = TfidfVectorizer(stop_words='english') #intailising vectorizers TF-IDF gives better accuracy by 1 percent compared to the other vectors count_pos = 0 count_neg = 0 ###################################################################################################### train = [] test = [] for i in range(len(train_data)): #processing of the train data string = train_data[i,0] string = vctr.build_preprocessor()(string.lower()) string = vctr.build_tokenizer()(string.lower()) train.append(' '.join(string)) for i in range(len(test_data)): #processing of the test data string = test_data[i,0] string = vctr.build_preprocessor()(string.lower()) string = vctr.build_tokenizer()(string.lower()) test.append(' '.join(string)) ###################################################################################################### train_data1 = vctr.fit_transform(train).toarray() #fitting the dictionary for bag of words model using TF-IDF vectorizers #X_test = vctr.transform(test).toarray() y_train = np.asarray(label_train, dtype="|S6") y_train = y_train.astype(int) clf1 = GradientBoostingClassifier(n_estimators = 500) #initialising classifiers clf2 = AdaBoostClassifier(n_estimators = 500)
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        doc = str(doc)
        s = "".join(doc.split("__EOS__"))
        doc = s.translate(None, string.punctuation)
        tokens = nltk.word_tokenize(doc)
        bi = list(p1 + " " + p2 for p1, p2 in nltk.bigrams(tokens))
        tokens.extend(bi)
        return [self.wnl.lemmatize(t) for t in tokens]


if _use_TFIDF_:
    #vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features, tokenizer=LemmaTokenizer())
    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1,
                                 stop_words=stoplist, max_features=no_of_features)

func_tokenizer = vectorizer.build_tokenizer()

'''
I was using two functions earlier for tokenization and data preprocessing.
Later implemented the LemmaTokenizer class for this.
'''

def ispunct(some_string):
    return not any(char.isalnum() for char in some_string)


def get_tokens(s):
    # Tokenize into words in sentences. Returns list of strs
    retval = []
    sents = sent_tokenize(s)
    for sent in sents:
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

dataset = fetch_20newsgroups(categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])  # take only 3 categories
vect = TfidfVectorizer()
tok = vect.build_tokenizer()  # tokenizes everything nicely
texts = []
lem = WordNetLemmatizer()
lemms = []
#for text in dataset.data:
#    for token in tok(text):
#        lemms.append(lem.lemmatize(token))
#    texts.append(lemms)
#models = models.Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
#models.save('texts.dat')
model = models.Word2Vec.load('texts.dat')
#print(model['theory'])
#print(model.similarity('man','car'))
#print(model.most_similar(positive=['man'],negative=['computer']))
print model.doesnt_match("car wheel glass engine".split())
class SplitVectorizer(): def __init__(self, tfidf_model=None, input_file_name=None, type_analyzer='word', n_gram_range=(1, 2), Xy='X', vectorize=False): if tfidf_model == None: assert input_file_name != None # Give model or input text self.model = TfidfVectorizer(analyzer=type_analyzer, ngram_range=n_gram_range) elif input_file_name == None: assert tfidf_model != None # Give model or input text self.model = tfidf_model elif not None in [input_file_name, tfidf_model]: self.model = tfidf_model self.XY = Xy self.input_file = input_file_name self.vectorize = vectorize def fit(self, X=None, y=None): with open(self.input_file) as f: self.model.fit(f) self.analyzer = self.model.build_analyzer() self.prep = self.model.build_preprocessor() self.tokenizer = self.model.build_tokenizer() self.vocab = {self.model.vocabulary_[w]: w for w in self.model.vocabulary_} return self def get_matrices(self): self.docs_X = [] self.docs_Y = [] for a in open(self.input_file): x = self.tokenizer(self.prep(a)) dl = len(x) self.docs_X.append(" ".join(x[:int(dl/2)])) self.docs_Y.append(" ".join(x[int(dl/2):])) return self.model.transform(self.docs_X), \ self.model.transform(self.docs_Y) def Tx(self, x): if self.vectorize: return self.model.transform([x]) else: return self.analyzer(x) def __iter__(self): for a in open(self.input_file): x = self.tokenizer(self.prep(a)) dl = len(x) if self.XY == 'X': yield self.Tx(" ".join(x[:int(dl/2)])) elif self.XY == 'Y': yield self.Tx(" ".join(x[int(dl/2):])) elif self.XY == 'join': yield self.Tx(" ".join(x[:int(dl/2)])), \ self.Tx(" ".join(x[int(dl/2):]))
class CleanAndVectorize(object): def __init__(self, en_kvs_path, **kwargs): max_df = kwargs.get('max_df', .9) max_features = kwargs.get('max_features', 1000) self.tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True, analyzer='word', max_df=max_df, max_features=max_features) self.w2v_vectorizer = KeyedVectors.load(en_kvs_path, mmap='r') self.tokenizer = self.tfidf_vectorizer.build_tokenizer() self.cols_to_extract = [ 'aft_id', 'aft_page', 'aft_page_revision', 'aft_user', 'aft_user_text', 'aft_comment', 'aft_noaction', 'aft_inappropriate', 'aft_helpful', 'aft_unhelpful', 'aft_rating' ] def get_token_vector(self, token): if token in self.w2v_vectorizer: return self.w2v_vectorizer[token] else: return np.zeros(self.w2v_vectorizer.vector_size) def get_sentence_vector(self, token_list): vector_list = np.array([self.get_token_vector(x) for x in token_list]) sentence_vector = np.mean(vector_list, axis=0) return sentence_vector def get_feature_vector(self, observation, add_rating=False): feature_vector = self.get_sentence_vector( observation['tokenized_text']) if add_rating: feature_vector = np.append(feature_vector, observation['aft_rating']) feature_vector = feature_vector.tolist() return feature_vector def process(self, observations, save_tokens=False, remove_zero=True, debug=False, add_rating=False): if debug: observations = observations.sample(debug) observations = observations[self.cols_to_extract] observations['aft_comment'] = observations['aft_comment'].astype(str) observations['aft_net_sign_helpful'] = np.sign( observations['aft_helpful'] - observations['aft_unhelpful']).astype(int) if remove_zero: observations = observations.loc[ observations['aft_net_sign_helpful'] != 0] observations['tokenized_text'] = observations['aft_comment'].apply( self.tokenizer) observations = observations.loc[ observations['tokenized_text'].apply(len) > 0] observations['feature_vector'] = observations[[ 'tokenized_text', 'aft_rating' ]].apply(self.get_feature_vector, axis=1, add_rating=add_rating) if not save_tokens: observations.drop(labels='tokenized_text', axis=1, inplace=True) return observations
class NLPPipeline(): def __init__(self, text, Y, train_size=.85): self.model_builders = {'dtc': dtc, 'rfc': rfc} steps = ['tfidf', 'feature_engineering', 'lda', 'model'] self.pipeline_dic = {step: None for step in steps} self.text_train, self.text_test, self.Y_train, self.Y_test = split( text, Y, train_size=train_size, stratify=Y) self.keep_tfidf = lambda tfidf_dic: (tfidf_dic == self.pipeline_dic[ 'tfidf']) self.keep_features = lambda features_dic: (features_dic == self. pipeline_dic['features']) self.prob_info = lambda prob: -prob * np.log(prob) self.pipeline_dic = {step: "Default" for step in steps} self.train_size = train_size def update_tfidf(self, tfidf_dic): self.pipeline_dic['tfidf'] = tfidf_dic self.tfidf = TfidfVectorizer(**tfidf_dic) self.tfidf_train = self.tfidf.fit_transform(self.text_train) self.tfidf_train = self.tfidf_train.toarray() self.tokenizer = self.tfidf.build_tokenizer() self.tfidf_test = self.tfidf.transform(self.text_test) self.tfidf_test = self.tfidf_test.toarray() self.feature_names = self.tfidf.get_feature_names() def update_lda(self, lda_dic): def calc_topics_words(num_top_words): topics_words = [] for ix, topic in enumerate(self.lda.components_): top_word_inds = topic.argsort()[:-num_top_words - 1:-1] topic_words = set( [self.feature_names[i] for i in top_word_inds]) topics_words.append(topic_words) return topics_words num_top_words = lda_dic[ 'num_top_words'] if 'num_top_words' in lda_dic else 10 lda_model_dic = { k: v for k, v in lda_dic.items() if k != 'num_top_words' } self.lda = LDA(**lda_model_dic) self.lda.fit_transform(self.tfidf_train) self.topics_words = calc_topics_words(num_top_words) def calc_entropy(self, text): word_counts = defaultdict(int) text_size = float(len(text)) for word in text: word_counts[word] += 1 word_counts = np.array(list(word_counts.values())) word_probs = word_counts / text_size entropy = -1 * sum(map(self.prob_info, word_probs)) return entropy def calc_lda_features(self, tokenized_text): num_topics = len(self.topics_words) unique_words = set(tokenized_text) num_unique_words = float(len(unique_words)) lda_features = [ len(unique_words.intersection(topic_words)) / num_unique_words for topic_words in self.topics_words ] return lda_features def calc_sentiment_features(self, text): min_polarity, max_polarity = -.1, .1 blob = TextBlob(text) polarities = [ sentence.sentiment.polarity for sentence in blob.sentences ] polarities = [round(polarity, 2) for polarity in polarities] polarity_entropy = self.calc_entropy(polarities) polarity_var = np.var(polarities) num_pos_sents = len( [polarity for polarity in polarities if polarity > max_polarity]) num_neg_sents = len( [polarity for polarity in polarities if polarity < min_polarity]) num_sents = float(len(polarities)) pos_sent_freq, neg_sent_freq = num_pos_sents / num_sents, num_neg_sents / num_sents num_neutral_sents = num_sents - num_pos_sents - num_neg_sents max_pol, min_pol = np.max(polarities) if polarities else 0, min( polarities) if polarities else 0 subjectivities = [ sentence.sentiment.subjectivity for sentence in blob.sentences ] subjectivities = [round(x, 2) for x in subjectivities] subj_var = np.var(subjectivities) max_subj, min_subj = np.max(subjectivities) if polarities else 0, min( subjectivities) if polarities else 0 sentiment_features = [ polarity_entropy, polarity_var, num_pos_sents, num_neg_sents, num_neutral_sents, pos_sent_freq, neg_sent_freq, num_sents, max_pol, min_pol, subj_var, max_subj, min_subj ] return sentiment_features def update_features(self, features_dic): 
def calc_features(text): words = self.tokenizer(text) entropy = self.calc_entropy(words) lda_features = self.calc_lda_features(words) sentiment_features = self.calc_sentiment_features(text) features = [entropy, *lda_features, *sentiment_features] return features self.pipeline_dic['features'] = features_dic self.update_lda(features_dic) self.X_train = np.hstack( (self.tfidf_train, np.array( [np.array(calc_features(text)) for text in self.text_train]))) self.X_test = np.hstack( (self.tfidf_test, np.array( [np.array(calc_features(text)) for text in self.text_test]))) def grid_search(self, step_grids): def get_step_dics(grid): param_names = list(grid.keys()) param_val_combos = list(product(*list(grid.values()))) num_params = len(param_names) step_dics = [{ param_names[j]: param_val_combo[j] for j in range(num_params) } for param_val_combo in param_val_combos] return step_dics steps = list(step_grids.keys()) num_steps = len(steps) grids = list(step_grids.values()) step_dics = list(map(get_step_dics, grids)) pipeline_combos = list(product(*step_dics)) pipeline_dics = [{ steps[i]: pipeline_combo[i] for i in range(num_steps) } for pipeline_combo in pipeline_combos] pipeline_scores = [[pipeline_dic, self.score(pipeline_dic)] for pipeline_dic in pipeline_dics] pipeline_scores.sort(key=lambda x: x[1], reverse=True) return pipeline_scores def score(self, pipeline_dic): tfidf_vectorizer = TfidfVectorizer(**pipeline_dic['tfidf']) keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf']) if not keep_tfidf: self.update_tfidf(pipeline_dic['tfidf']) keep_features = keep_tfidf and self.keep_features( pipeline_dic['features']) if not keep_features: self.update_features(pipeline_dic['features']) self.model_builder = self.model_builders[pipeline_dic['model']['type']] model_dic = { key: value for key, value in pipeline_dic['model'].items() if key != 'type' } self.model = self.model_builder(**model_dic) self.model.fit(self.X_train, self.Y_train) Y_pred = self.model.predict(self.X_test) score = accuracy(Y_pred, self.Y_test) print(f"Params = {pipeline_dic}, score = {score}. \n") return score
def visit(self, featureset): try: # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then) tokenizer = None if self._extraction_target == "word": tokenizer = LemmaTokenizer(LanguageProcessor()) elif self._extraction_target == "pos": tokenizer = POSTokenizer(LanguageProcessor()) elif self._extraction_target == "ne_simple": tokenizer = NamedEntityTokenizer(LanguageProcessor()) elif self._extraction_target == "ne_detailed": tokenizer = NamedEntityTokenizer(LanguageProcessor(), detailed=True) elif self._extraction_target.startswith("wordlist"): path = self._extraction_target.split("_")[1] tokenizer = WordlistEntryTokenizer(LanguageProcessor(), wordlist=path) # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then) print(self._ngram) print(self._column) vectorizer = None binary = self._measure == "presence" or self._extraction_type == "presence" if self._ngram is None: if self._measure == "tfidf": vectorizer = TfidfVectorizer(tokenizer=tokenizer) else: vectorizer = CountVectorizer(tokenizer=tokenizer, binary=binary) else: if self._measure == "tfidf": vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=self._ngram) else: vectorizer = CountVectorizer(tokenizer=tokenizer, ngram_range=self._ngram, binary=binary) temp_column = featureset.get_featureset()[self._column] temp_column = temp_column.values new_column = [] "Note: Presence and Count for every(einzeln) feature or for all(alle) feature" if self._extraction_type == "bow" or self._extraction_type == "ngram": # Return Matrix new_column = list( vectorizer.fit_transform(temp_column).toarray()) elif self._extraction_type == "list": # Return String Array analyzer = vectorizer.build_tokenizer() for row in temp_column: print(row) print(analyzer(row)) new_column.append(analyzer(row)) elif self._extraction_type == "presence": # Return Numeric Array analyzer = vectorizer.build_tokenizer() for row in temp_column: new_column.append(1 if len(analyzer(row)) > 0 else 0) # new_column.append(len(analyzer(row)) > 0) elif self._extraction_type == "count": # Return Numeric Array analyzer = vectorizer.build_tokenizer() for row in temp_column: new_column.append(len(analyzer(row))) return new_column except Exception as error: util.print_error("Failed to use Language Processor " + str(error)) util.print_detailed_error()
class file_index(object): "Use n_jobs = 1 for now." def __init__(self, input_file, index_file=None, mmap=True, wsize=10, vectorizer=None, encoding='latin1', sampsize=50, n_jobs=1, chunk_size=1000, verbose=True): self.mmap = mmap self.memory = ":memory:" if not (vectorizer is None): self.vectorizer = vectorizer self.tokenizer = vectorizer.build_tokenizer() self.encoder = encoding self.index_file = self.memory if ( not index_file or index_file == ":memory:") else index_file self.chunk_size = chunk_size self.input_file = input_file self.wsize = wsize self.n_jobs = n_jobs self.verbose = verbose self.sampsize = sampsize if not os.path.exists(self.index_file): self.connect() self.cursor.execute("create table words (word text, coo text)") else: self.connect() self.load_input() def __enter__(self): self.connect() return self def __exit__(self): self.disconnect() return self def windows(self, word): if self.n_jobs != 1: self.connect() try: self.index_lines except AttributeError: self.load_input() if self.sampsize > 0: query = "select * from words where word=? order by random() limit ?" t = (word, self.sampsize) self.cursor.execute(query, t) else: query = "select * from words where word=?" self.cursor.execute(query, (word, )) coordinates = self.str2tup([t for w, t in self.cursor.fetchall()]) windows = [] for r, w in coordinates: try: ln = self.index_lines[r].split() #decode("utf-8").split() except UnicodeDecodeError: continue except AttributeError: print( "\nCall 'load_input()' method before querying windows.\n") raise start = min(len(ln[0:w]), self.wsize) windows.append(ln[w - start:w] + ln[w + 1:w + (self.wsize + 1)]) if self.verbose > 10: logging.info("Got windows for '%s'\n" % word) return windows def fit(self): with open(self.input_file, mode='rb') as f: # encoding='latin-1', mode='rb') as f: if self.index_file != self.memory and self.chunk_size > 0: c = 0 ck = 0 for n, row in enumerate(enumerate(f)): #st() self.index_row(n, row[1]) if c == self.chunk_size: c = 0 self.conn.commit() if self.verbose > 5: logging.info( "Saved index chunk %d into index file %s \n" % (ck, self.index_file)) ck += 1 c += 1 else: if self.verbose: logging.info("Creating index in-memory database... \n") for n, row in enumerate(get_binary(self.input_file)): self.index_row(n, row) try: self.cursor.execute("create index idxword on words(word)") self.conn.commit() # Getting properties self.cursor.execute("SELECT * FROM words") self.vocab = list(set([r[0] for r in self.cursor.fetchall()])) self.vocab_size = len(self.vocab) if self.verbose: logging.info("Saved index into index file datbase %s\n" % self.index_file) return self except: print("Database couldn't be created... 
EXIT error.") raise def load_input(self): """ Call this method when a prefitted index db file already exists""" with open(self.input_file, mode='rb') as fc: # encoding=self.encoder, mode='rb') as fc: self.index_lines = fc.readlines() self.cursor.execute("SELECT * FROM words") self.vocab = list(set([r[0] for r in self.cursor.fetchall()])) self.vocab_size = len(self.vocab) logging.info("Loaded index database properties and connections..") # Return pointer to the index return self def connect(self): self.conn = sqlite3.connect(self.index_file, check_same_thread=False) self.cursor = self.conn.cursor() return self def disconnect(self): self.conn.commit() self.conn.close() return self def tup2str(self, t): if isinstance(t, list): return [str(a) + ',' + str(b) for a, b in t] else: return str(t[0]) + ',' + str(t[1]) def str2tup(self, t): if isinstance(t, list): r = [] for x in t: r.append(self.str2tup(x)) return r else: a, b = t.split(',') return (int(a), int(b)) def index_row(self, line_id, row, conn=None): if self.n_jobs != 1 and self.n_jobs != 0: cursor = conn.cursor() else: cursor = self.cursor for of, word in enumerate(self.tokenize(row)): if word is None: continue t = (word, self.tup2str((line_id, of))) insert = "INSERT INTO words VALUES (?, ?)" try: cursor.execute(insert, t) except sqlite3.OperationalError: print("Problems to create word table '%s'.\n" % word) self.disconnect() raise def tokenize(self, string): if self.tokenizer: if self.vectorizer.lowercase: try: string = string.decode(errors="replace").lower() except Exception as e: logging.info( "Problems occurred while indexing row: {}\nEXCEPTION: {}" .format(row, e)) return None return [w.encode() for w in self.tokenizer(string)] else: self.vectorizer = TfidfVectorizer() self.tokenizer = self.vectorizer.build_tokenizer() return self.tokenize(string)
def main(): seed = 9001 combined_data = read_all_data() # Create train/test split of data x_train, x_test, y_train, y_test = train_test_split( combined_data["headline"], combined_data["is_clickbait"], random_state=seed) if len(sys.argv) > 1: print() print("Loading pickle...") print() pipe = utils.unpickle_gzip("models/pipeline.pickle.gz") else: print() print("Training...") print() # Instantiate TfidVectrorizer to translate text data to feature vectors # such that they can be used as inputs for an estimator tf_v = TfidfVectorizer(strip_accents='unicode') # With the vectorizer trained, let's load some different estimators clf = LogisticRegressionCV( cv=5, solver='saga', random_state=seed, ) pipe = make_pipeline(tf_v, clf) pipe.fit(x_train, y_train) print() print("Predicting...") print() predictions = pipe.predict(x_test) utils.print_evaluation(y_test, predictions) if len(sys.argv) <= 1: print() print("Pickling...") print() utils.pickle_gzip(pipe, "models/pipeline.pickle.gz") # CANNOT RUN DUE TO MEMORY # rfc = RandomForestClassifier( # n_jobs=-1, # n_estimators=1000, # random_state=seed, # verbose=3) # predictions = rfc.predict(x_test) # utils.print_evaluation(y_test, predictions) print("\n\nPlotting frequency of word use . . .") plot_split_word_freqs(combined_data, tf_v.build_preprocessor(), tf_v.build_tokenizer())
def visit(self, featureset): try: # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then) tokenizer = None if self._extraction_target == "word": tokenizer = LemmaTokenizer(LanguageProcessor()) elif self._extraction_target == "pos": tokenizer = POSTokenizer(LanguageProcessor()) elif self._extraction_target == "ne_simple": tokenizer = NamedEntityTokenizer(LanguageProcessor()) elif self._extraction_target == "ne_detailed": tokenizer = NamedEntityTokenizer(LanguageProcessor(), detailed=True) elif self._extraction_target.startswith("wordlist"): path = self._extraction_target.split("_")[1] tokenizer = WordlistEntryTokenizer(LanguageProcessor(), wordlist=path) # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then) print(self._ngram) print(self._column) vectorizer = None binary = self._measure == "presence" or self._extraction_type == "presence" if self._ngram is None: if self._measure == "tfidf": vectorizer = TfidfVectorizer(tokenizer=tokenizer) else: # TODO: here it is absolute term-frequency - what about relative? # For ngrams not easy: # - needs to count the amount of n-gram for each document and divide each feature generated from # the ngram-counts of the document by that amount # For named-entities: # - count words inside named entities (not just the amount of NEs) devide by num tokens of doc # ... vectorizer = CountVectorizer(tokenizer=tokenizer, binary=binary) else: if self._measure == "tfidf": vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=self._ngram) else: vectorizer = CountVectorizer(tokenizer=tokenizer, ngram_range=self._ngram, binary=binary) temp_column = featureset.get_featureset()[self._column] temp_column = temp_column.values new_column = [] "Note: Presence and Count for every(einzeln) feature or for all(alle) feature" if self._extraction_type == "bow" or self._extraction_type == "ngram": # Return Matrix new_column = list(vectorizer.fit_transform(temp_column).toarray()) elif self._extraction_type == "list": # Return String Array analyzer = vectorizer.build_tokenizer() for row in temp_column: print(row) print(analyzer(row)) new_column.append(analyzer(row)) elif self._extraction_type == "presence": # Return Numeric Array analyzer = vectorizer.build_tokenizer() for row in temp_column: new_column.append(1 if len(analyzer(row)) > 0 else 0) # new_column.append(len(analyzer(row)) > 0) elif self._extraction_type == "count": # Return Numeric Array analyzer = vectorizer.build_tokenizer() for row in temp_column: new_column.append(len(analyzer(row))) return new_column except Exception as error: util.print_error("Failed to use Language Processor " + str(error)) util.print_detailed_error()
def __init__(self, vectorizer: TfidfVectorizer):
    self.vectorizer = vectorizer
    self.vocab = vectorizer.get_feature_names()
    self.tokenizer = vectorizer.build_tokenizer()
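# A minimal sketch (the helper known_tokens is made up here, not part of the class above)
# of how the stored vocab and tokenizer are typically combined: tokenize raw text, then
# keep only the tokens the fitted vocabulary actually contains.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer().fit(["the quick brown fox", "jumped over the lazy dog"])
vocab = set(vectorizer.get_feature_names())  # get_feature_names_out() on newer scikit-learn
tokenize = vectorizer.build_tokenizer()

def known_tokens(text):
    # build_tokenizer() does not lowercase, so apply the same casing the vectorizer used
    return [t for t in tokenize(text.lower()) if t in vocab]

print(known_tokens("The FOX jumped quickly"))  # ['the', 'fox', 'jumped']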
if __name__ == '__main__': #load the vocab file train_list = get_values('train.txt') test_list = get_values('test.txt') valid_list = get_values('valid.txt') targetlist = valid_list l = 100000 targetlist = targetlist[:l] all_docs_str, all_docs_list = makedocs() file = open("evidence_dev.txt", "w", encoding='utf-8') cnt = 0 wholestring = "" tiv = TfidfVectorizer(stop_words="english").fit(all_docs_str) tokenizer = tiv.build_tokenizer() all_docs_numpy = tiv.transform(all_docs_str) all_docs_text_numpy = np.array(all_docs_list) #all doc key doc_keys = pickle.load(file=open("dockey.pkl", 'rb')) # way to find relative docs importantword = False for crset in tqdm(targetlist): #find doc #temp = TfidfVectorizer(stop_words="english").fit([crset]) #print(set(co_command.keys()).intersection(set(temp.vocabulary_.keys()))) #if set(co_command.keys()).intersection(set(temp.vocabulary_.keys())) is None:
class FeatureExtractor: vectorizer = None feature_names = None feature_matrix = None features = None def train_extractor_from_lines(self, lines): self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT) self.vectorizer.fit(lines) pass def train_extractor(self, full=False): lines = dir2lines_labels('data/train_lite.csv') self.train_extractor_from_lines(lines) pass def get_features_distance_matrix(self, feature_df): mat = np.zeros((len(feature_df), len(feature_df))) for index1, row1 in feature_df.iterrows(): for index2, row2 in feature_df.iterrows(): mat[index1, index2] = self.feature_distance(row1, row2) return mat def get_lines_distance_matrix(self, lines): feature_df = self.lines2features(lines) self.features = feature_df return self.get_features_distance_matrix(feature_df) def lines2features(self, lines, use_tense=False): """ returns DataFrame(feature_matrix, feature_name) ['word_rainny', 'word_'sunny'], array([ [1, 0.4, 0.2], [0.2, 1, 0.2], ]) """ self.feature_names = [] self.feature_matrix = None # tf-idf features data = self.vectorizer.transform(lines).toarray() self.feature_names = self.vectorizer.get_feature_names() self.feature_matrix = data # additional features add_features = [] important_words = [ 'sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain', 'snow', 'tornado', 'storm', 'hurricane' ] important_words = [ 'cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind' ] self.feature_names = self.feature_names + [ 'impt_words:' + word for word in important_words ] if use_tense: self.feature_names = self.feature_names + [ 'past_tense_num', 'present_tense_num' ] all_words = self.lines2words(lines) for words in all_words: # important words important_words_ftr = [ int(word in words) for word in important_words ] add_features.append(important_words_ftr) # tense if use_tense: tagz = zip(*nltk.pos_tag(nltk.word_tokenize(words)))[1] past_num = len([v for v in tagz if v == 'VBD']) present_num = len([v for v in tagz if v in ['VBP', 'VB']]) add_features.append([past_num, present_num]) self.feature_matrix = np.hstack((self.feature_matrix, add_features)) return DataFrame(self.feature_matrix, columns=self.feature_names) def feature_distance(self, feature_vector1, feature_vector2): # preliminary version return 1 - np.dot(feature_vector1, feature_vector2) / np.sqrt( (np.dot(feature_vector1, feature_vector1) * np.dot(feature_vector2, feature_vector2))) def lines2words(self, lines): self.tokenizer = self.vectorizer.build_tokenizer() return [self.tokenizer(line) for line in lines] def load_vectorizer(self): input_file = open('models/tfidf_vectorizer.pkl', 'rb') self.vectorizer = pickle.load(input_file) input_file.close() pass def save_vectorizer(self): output_file = open('models/tfidf_vectorizer.pkl', 'wb') pickle.dump(self.vectorizer, output_file) output_file.close() pass
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])
#print(X_train)

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')

test_string = unicode(nyt_data[0])
print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))
print "\n"

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"
print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
def weighted_embeddings(esco_df, eperusteet_df, model): """ Create TFIDF weighted embeddings for ESCO and ePerusteet. The input sentences should be separated with newlines. Args: esco_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ESCO. eperusteet_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ePerusteet. model (fasttext.model) : Model for word-embeddings. Return: X_esco (xArray) : Embeddings for ESCO texts. X_eperusteet (xArray) : Embeddings for ePerusteet texts. """ assert isinstance(esco_df, pd.DataFrame) assert isinstance(eperusteet_df, pd.DataFrame) text_esco = esco_df["text"] text_eperusteet = eperusteet_df["text"] # Do not sort - to we can resplit using the indices combined_texts = pd.concat([text_esco, text_eperusteet], sort=False) vectorizer = TfidfVectorizer() vectorizer.fit(combined_texts) tokenizer = vectorizer.build_tokenizer() feature_array = vectorizer.get_feature_names() identifiers = [] embeddings = [] for _, row in tqdm(esco_df.iterrows(), total=esco_df.shape[0], desc="Computing embeddings for ESCOs"): identifiers.append(row["label"]) texts = row["text"].split("\n") # Take average over the sentences competence_embedding = xr.DataArray(np.zeros(model.get_dimension()), dims=["embedding"]) for text in texts: sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()), dims=["embedding"]) weights = vectorizer.transform([text]) nonzero_indexes = weights.nonzero() weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, )) weights = [w / sum(weights) for w in weights] weight_dict = { feature_array[idx]: weights[i] for i, idx in enumerate(nonzero_indexes[1]) } for word in text.split(" "): try: token = tokenizer(word)[0] except IndexError: continue weight = weight_dict[token] sentence_embedding += (model[word] * weight) competence_embedding += sentence_embedding # If the texts was empty, avoid division and add the 0-vector if not texts: competence_embedding = competence_embedding / len(texts) embeddings.append(competence_embedding) embeddings = np.stack(embeddings, axis=0) esco_embeddings = xr.DataArray(embeddings, coords={"ESCO": identifiers}, dims=["ESCO", "embedding"]) identifiers = [] embeddings = [] for _, row in tqdm(eperusteet_df.iterrows(), total=eperusteet_df.shape[0], desc="Computing embeddings for ePerusteet"): identifiers.append(row["label"]) texts = row["text"].split("\n") # Take average over the sentences degree_embedding = xr.DataArray(np.zeros(model.get_dimension()), dims=["embedding"]) for text in texts: sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()), dims=["embedding"]) weights = vectorizer.transform([text]) nonzero_indexes = weights.nonzero() weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, )) weights = [w / sum(weights) for w in weights] weights = { feature_array[idx]: weights[i] for i, idx in enumerate(nonzero_indexes[1]) } for word in text.split(" "): try: token = tokenizer(word)[0] except IndexError: continue weight = weights[token] sentence_embedding += (model[word] * weight) degree_embedding += sentence_embedding # If the texts was empty, avoid division and add the 0-vector if not texts: degree_embedding = degree_embedding / len(texts) embeddings.append(degree_embedding) embeddings = np.stack(embeddings, axis=0) eperusteet_embeddings = xr.DataArray(embeddings, coords={"ePerusteet": identifiers}, dims=["ePerusteet", "embedding"]) return esco_embeddings, eperusteet_embeddings
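# A minimal, self-contained sketch (not the project's code) of the tf-idf-weighted
# averaging idea used in weighted_embeddings() above: weight each word's vector by its
# tf-idf score in the sentence, then average. A plain dict of toy vectors stands in for
# the fasttext model; the helper name weighted_sentence_embedding is made up here.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectors = {"cat": np.array([1.0, 0.0]),
                "sat": np.array([0.0, 1.0]),
                "mat": np.array([1.0, 1.0])}
dim = 2

corpus = ["the cat sat", "the cat sat on the mat"]
vectorizer = TfidfVectorizer().fit(corpus)
tokenize = vectorizer.build_tokenizer()
feature_index = vectorizer.vocabulary_  # token -> column in the tf-idf matrix

def weighted_sentence_embedding(sentence):
    weights = vectorizer.transform([sentence])
    embedding = np.zeros(dim)
    total = 0.0
    for token in tokenize(sentence.lower()):
        if token in word_vectors and token in feature_index:
            w = weights[0, feature_index[token]]
            embedding += w * word_vectors[token]
            total += w
    return embedding / total if total else embedding

print(weighted_sentence_embedding("the cat sat on the mat"))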
def main(): accuracies = defaultdict(lambda: []) aucs = defaultdict(lambda: []) x_axis = defaultdict(lambda: []) vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1), token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer()) # # vct = CountVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=True, ngram_range=(1, 1), # token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer()) vct_analizer = vct.build_tokenizer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] min_size = 10 # max(10, args.fixk) # if args.fixk < 0: args.fixk = None data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True) print("Data %s" % args.train) print("Data size %s" % len(data.train.data)) print ("Vectorizer: %s" % vct) parameters = parse_parameters_mat(args.cost_model) print "Cost Parameters %s" % parameters cost_model = set_cost_model(args.cost_function, parameters=parameters) print "\nCost Model: %s" % cost_model.__class__.__name__ ### SENTENCE TRANSFORMATION sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ## delete <br> to "." to recognize as end of sentence data.train.data = clean_html(data.train.data) data.test.data = clean_html(data.test.data) # labels, sent_train = split_data_sentences(data.train, sent_detector) # # data.train.data = sent_train # data.train.target = np.array(labels) # labels, sent_train = split_data_sentences(data.test, sent_detector) # data.test.data = sent_train # data.test.target = np.array(labels) print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0])) ## Get the features of the sentence dataset # data.train.bow = vct.fit_transform(data.train.data) data.test.bow = vct.transform(data.test.data) #### EXPERT CLASSIFIER exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty) exp_clf.fit(data.test.bow, data.test.target) expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold, cost_function=cost_model.cost_function) print "\nExpert: %s " % expert #### STUDENT CLASSIFIER clf = linear_model.LogisticRegression(penalty="l1", C=1) # clf = set_classifier(args.classifier) student = structured.AALStructured(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct, subpool=250, cost_model=cost_model) student.set_score_model(exp_clf) print "\nStudent Classifier: %s" % clf #### ACTIVE LEARNING SETTINGS step_size = args.step_size bootstrap_size = args.bootstrap evaluation_points = 200 print ("Sentence Classification") t0 = time.time() tac = [] tau = [] # predition = exp_clf.predict(data.train.bow) print ("Prepare test ... 
") ## create sentences from documents based on first k ## random ## best sentence filtered_data = [] bestk = [] bestk_max = [] random_k = [] print "First k=1" for iDoc, y in zip(data.train.data, data.train.target): doc_sent = split_into_sentences([iDoc], sent_detector, vct) random_k.append(doc_sent[random.randint(0, len(doc_sent)-1)]) scores = [best_score_max(iSent, y, exp_clf) for iSent in vct.transform(doc_sent)] best = np.argmax(scores) bestk_max.append(doc_sent[best]) scores = [best_score(iSent, y, exp_clf) for iSent in vct.transform(doc_sent)] best = np.argmax(scores) bestk.append(doc_sent[best]) filtered_data.append(doc_sent[0]) test_firstk = vct.transform(filtered_data) test_random = vct.transform(random_k) test_best = vct.transform(bestk) test_best_max = vct.transform(bestk) targets = data.train.target print"*"*80 accu, auc, predictions = evaluate(exp_clf, test_random, targets, vct) print "RND: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) accu, auc, predictions = evaluate(exp_clf, test_firstk, targets, vct) print "FIRSTK: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) accu, auc, predictions = evaluate(exp_clf, test_best, targets, vct) print "BEST: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) accu, auc, predictions = evaluate(exp_clf, test_best_max, targets, vct) print "BESTMAX: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) # print"*"*80 # print "STUDENT" # clf.fit(test_random, targets) # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct) # print "RND: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) # clf.fit(test_firstk, targets) # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct) # print "FIRSTK: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) # clf.fit(test_best, targets) # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct) # print "BEST: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0]) print("Elapsed time %.3f" % (time.time() - t0))
__author__ = '315-4'
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = fetch_20newsgroups()  # dataset: the 20 newsgroups collection
vect = TfidfVectorizer()        # converter to a TF-IDF matrix
tok = vect.build_tokenizer()    # tokenizer

texts = []  # tokenize the texts
for text in dataset.data:
    texts.append(tok(text))

# gensim enters the stage
# Convert each document (a list of words) into a bag-of-words
dictionary = corpora.Dictionary(texts)                 # build the dictionary (the set of tokens)
corpus = [dictionary.doc2bow(text) for text in texts]  # the corpus
new_vec = dictionary.doc2bow(tok('Hello world'))       # this is not used anywhere

# Train the LDA model
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                               num_topics=100, update_every=1, chunksize=10000, passes=1)

# print the per-topic word distributions (what the original comment calls the V matrix of the decomposition)
for item in lda.print_topics(100):
    print(item)
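# A small follow-up sketch (not in the original): the same tokenizer/dictionary pipeline can
# score an unseen document against the trained model. The example sentence is made up.
unseen_bow = dictionary.doc2bow(tok('God does not play dice with electronic circuits'))
for topic_id, prob in lda.get_document_topics(unseen_bow):
    print(topic_id, prob)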
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])
# print(X_train)

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')

test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svm_classifier = LinearSVC().fit(X_train, y_train)
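# A hedged evaluation sketch (not part of the original snippet): score the fitted LinearSVC
# on the held-out NYT split prepared above.
from sklearn import metrics

y_pred = svm_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Accuracy: %.3f" % metrics.accuracy_score(y_test, y_pred))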
# NOTE: the original snippet does not show its imports; the ones below are assumptions
# inferred from usage (e.g. `split` looks like train_test_split, `LDA` like sklearn's
# LatentDirichletAllocation, `accuracy` like accuracy_score, `dtc`/`rfc` like tree models).
from collections import defaultdict
from itertools import product

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score as accuracy
from textblob import TextBlob


class NLPPipeline():

    def __init__(self, text, Y, train_size=.85):
        self.model_builders = {'dtc': dtc, 'rfc': rfc}
        steps = ['tfidf', 'feature_engineering', 'lda', 'model']
        self.pipeline_dic = {step: None for step in steps}
        self.text_train, self.text_test, self.Y_train, self.Y_test = split(
            text, Y, train_size=train_size, stratify=Y)
        self.keep_tfidf = lambda tfidf_dic: (tfidf_dic == self.pipeline_dic['tfidf'])
        self.keep_features = lambda features_dic: (features_dic == self.pipeline_dic['features'])
        self.prob_info = lambda prob: -prob * np.log(prob)
        self.pipeline_dic = {step: "Default" for step in steps}
        self.train_size = train_size

    def update_tfidf(self, tfidf_dic):
        self.pipeline_dic['tfidf'] = tfidf_dic
        self.tfidf = TfidfVectorizer(**tfidf_dic)
        self.tfidf_train = self.tfidf.fit_transform(self.text_train)
        self.tfidf_train = self.tfidf_train.toarray()
        self.tokenizer = self.tfidf.build_tokenizer()
        self.tfidf_test = self.tfidf.transform(self.text_test)
        self.tfidf_test = self.tfidf_test.toarray()
        self.feature_names = self.tfidf.get_feature_names()

    def update_lda(self, lda_dic):

        def calc_topics_words(num_top_words):
            topics_words = []
            for ix, topic in enumerate(self.lda.components_):
                top_word_inds = topic.argsort()[:-num_top_words - 1:-1]
                topic_words = set([self.feature_names[i] for i in top_word_inds])
                topics_words.append(topic_words)
            return topics_words

        num_top_words = lda_dic['num_top_words'] if 'num_top_words' in lda_dic else 10
        lda_model_dic = {k: v for k, v in lda_dic.items() if k != 'num_top_words'}
        self.lda = LDA(**lda_model_dic)
        self.lda.fit_transform(self.tfidf_train)
        self.topics_words = calc_topics_words(num_top_words)

    def calc_entropy(self, text):
        '''Many other equivalent ways to calculate entropy.
        This seems to be the fastest: about 5x faster than scipy's entropy method.'''
        word_counts = defaultdict(int)
        text_size = float(len(text))
        for word in text:
            word_counts[word] += 1
        word_counts = np.array(list(word_counts.values()))
        word_probs = word_counts / text_size
        # prob_info already returns -p * log(p), so summing it gives the entropy directly
        # (the original multiplied the sum by -1 again, which flipped the sign).
        entropy = sum(map(self.prob_info, word_probs))
        return entropy

    def calc_lda_features(self, tokenized_text):
        num_topics = len(self.topics_words)
        unique_words = set(tokenized_text)
        num_unique_words = float(len(unique_words))
        lda_features = [
            len(unique_words.intersection(topic_words)) / num_unique_words
            for topic_words in self.topics_words
        ]
        return lda_features

    def calc_sentiment_features(self, text):
        min_polarity, max_polarity = -.1, .1
        blob = TextBlob(text)
        polarities = [sentence.sentiment.polarity for sentence in blob.sentences]
        polarities = [round(polarity, 2) for polarity in polarities]
        polarity_entropy = self.calc_entropy(polarities)
        polarity_var = np.var(polarities)
        num_pos_sents = len([polarity for polarity in polarities if polarity > max_polarity])
        num_neg_sents = len([polarity for polarity in polarities if polarity < min_polarity])
        num_sents = float(len(polarities))
        pos_sent_freq, neg_sent_freq = num_pos_sents / num_sents, num_neg_sents / num_sents
        num_neutral_sents = num_sents - num_pos_sents - num_neg_sents
        max_pol = np.max(polarities) if polarities else 0
        min_pol = min(polarities) if polarities else 0
        subjectivities = [sentence.sentiment.subjectivity for sentence in blob.sentences]
        subjectivities = [round(x, 2) for x in subjectivities]
        subj_var = np.var(subjectivities)
        max_subj = np.max(subjectivities) if subjectivities else 0
        min_subj = min(subjectivities) if subjectivities else 0
        sentiment_features = [
            polarity_entropy, polarity_var, num_pos_sents, num_neg_sents,
            num_neutral_sents, pos_sent_freq, neg_sent_freq,
            num_sents, max_pol, min_pol, subj_var, max_subj, min_subj
        ]
        return sentiment_features

    def update_features(self, features_dic):
        """
        From a dictionary containing parameter labels and values used for building
        features (currently just LDA), updates feature matrices by re-calculating
        features for each text.

        Arguments
            features_dic (dictionary): A dictionary with string parameter names as keys
            and ints/floats as values.
            Example: features_dic = {'n_components': 10, 'n_words': 10}
        """

        def calc_features(text):
            words = self.tokenizer(text)
            entropy = self.calc_entropy(words)
            lda_features = self.calc_lda_features(words)
            sentiment_features = self.calc_sentiment_features(text)
            features = [entropy, *lda_features, *sentiment_features]
            return features

        self.pipeline_dic['features'] = features_dic
        self.update_lda(features_dic)
        self.X_train = np.hstack(
            (self.tfidf_train,
             np.array([np.array(calc_features(text)) for text in self.text_train])))
        self.X_test = np.hstack(
            (self.tfidf_test,
             np.array([np.array(calc_features(text)) for text in self.text_test])))

    def grid_search(self, step_grids):
        """
        From a nested dictionary containing grids for each pipeline step, fit and score
        each possible pipeline permutation (nested permutation of the step permutations).

        Arguments
            step_grids: A nested dictionary containing the step grid for each step.
            Example:
                step_grids = {'tfidf': {'min_df': [0.1]},
                              'features': {'n_components': [10], 'num_top_words': [10]},
                              'model': {'type': ['rfc']}}

        Returns
            pipeline_scores: A sorted list of 2-tuples containing the pipeline dictionary
            and score of each pipeline permutation.
        """

        def get_step_perms(grid):
            """
            From grid (dict) mapping each parameter name to a list of values for that
            parameter, returns the list of all permutations (dicts) that can be made by
            choosing a different value for each parameter from its values list.

            Arguments
                grid ({string: list}): A dictionary mapping parameter names to a list of
                parameter values.
                Example: grid = {'min_df': [0.1], 'max_df': [0.8, 0.9]}

            Returns
                step_perms ([dict]): A list of all dictionary permutations for the step
                that can be made by choosing different parameter values from each
                parameter's domain.
                Example: For the above grid example, we'd have
                    step_perms = [{'min_df': 0.1, 'max_df': 0.8},
                                  {'min_df': 0.1, 'max_df': 0.9}]
            """
            param_names = list(grid.keys())
            param_val_perms = list(product(*list(grid.values())))
            num_params = len(param_names)
            step_perms = [{param_names[j]: param_val_perm[j] for j in range(num_params)}
                          for param_val_perm in param_val_perms]
            return step_perms

        steps = list(step_grids.keys())
        num_steps = len(steps)
        grids = list(step_grids.values())
        step_perms = list(map(get_step_perms, grids))
        pipeline_perms = list(product(*step_perms))
        pipeline_perms = [{steps[i]: pipeline_perm[i] for i in range(num_steps)}
                          for pipeline_perm in pipeline_perms]
        pipeline_scores = [[pipeline_perm, round(self.score(pipeline_perm), 3)]
                           for pipeline_perm in pipeline_perms]
        pipeline_scores.sort(key=lambda x: x[1], reverse=True)
        return pipeline_scores

    def score(self, pipeline_dic):
        tfidf_vectorizer = TfidfVectorizer(**pipeline_dic['tfidf'])
        keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf'])
        if not keep_tfidf:
            self.update_tfidf(pipeline_dic['tfidf'])
        keep_features = keep_tfidf and self.keep_features(pipeline_dic['features'])
        if not keep_features:
            self.update_features(pipeline_dic['features'])
        self.model_builder = self.model_builders[pipeline_dic['model']['type']]
        model_dic = {key: value for key, value in pipeline_dic['model'].items()
                     if key != 'type'}
        self.model = self.model_builder(**model_dic)
        self.model.fit(self.X_train, self.Y_train)
        Y_pred = self.model.predict(self.X_test)
        score = accuracy(Y_pred, self.Y_test)
        print(f"Params = {pipeline_dic}, score = {round(score, 3)}. \n")
        return score
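# A minimal usage sketch (assumptions, not from the original source): `texts` and `labels`
# stand in for whatever corpus the class is actually driven with, and the grid values are
# illustrative only.
texts = ["the cat sat on the mat", "stocks rallied on strong earnings news"] * 50
labels = [0, 1] * 50
pipeline = NLPPipeline(texts, labels, train_size=.85)
pipeline_scores = pipeline.grid_search({
    'tfidf': {'min_df': [1], 'max_df': [1.0]},
    'features': {'n_components': [5], 'num_top_words': [10]},
    'model': {'type': ['rfc'], 'n_estimators': [50]},
})
print(pipeline_scores[0])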
# NOTE: imports are not shown in the original; the ones below are inferred from usage.
# `tokenizer`, `DISTINCT_WORDS_CNT`, `file2lines` and `file2labels` are assumed to be
# defined elsewhere in the project.
import pickle

import nltk
import numpy as np
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer


class FeatureExtractor:
    vectorizer = None
    feature_names = None
    feature_matrix = None

    def train_extractor_from_lines(self, train_lines, labels, test_lines):
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT)
        self.vectorizer.fit(train_lines + test_lines)

    def load_vectorizer(self):
        input_file = open('../models/tfidf_vectorizer.pkl', 'rb')
        self.vectorizer = pickle.load(input_file)
        input_file.close()

    def save_vectorizer(self):
        output_file = open('../models/tfidf_vectorizer.pkl', 'wb')
        pickle.dump(self.vectorizer, output_file)
        output_file.close()

    def train_extractor(self, full=False):
        if not full:
            train_lines = file2lines('../data/train_lite.csv')
            labels = file2labels('../data/train_lite.csv')
            test_lines = file2lines('../data/test_lite.csv')
        else:
            train_lines = file2lines('../data/train.csv')
            labels = file2labels('../data/train.csv')
            test_lines = file2lines('../data/test.csv')
        self.train_extractor_from_lines(train_lines, labels, test_lines)

    def lines2words(self, lines):
        self.tokenizer = self.vectorizer.build_tokenizer()
        return [self.tokenizer(line) for line in lines]

    def lines2features(self, lines, use_tense=False):
        """
        Returns DataFrame(feature_matrix, feature_names), e.g.

            ['word_rainny', 'word_sunny'],
            array([
                [1, 0.4, 0.2],
                [0.2, 1, 0.2],
            ])
        """
        self.feature_names = []
        self.feature_matrix = None

        # tf-idf features
        data = self.vectorizer.transform(lines).toarray()
        self.feature_names = self.vectorizer.get_feature_names()
        self.feature_matrix = data

        # additional features
        add_features = []
        important_words = [
            'sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain',
            'snow', 'tornado', 'storm', 'hurricane'
        ]
        important_words = [
            'cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice',
            'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind'
        ]
        self.feature_names = self.feature_names + [
            'impt_words:' + word for word in important_words
        ]
        if use_tense:
            self.feature_names = self.feature_names + [
                'past_tense_num', 'present_tense_num'
            ]

        all_words = self.lines2words(lines)
        for words in all_words:
            # important words
            important_words_ftr = [int(word in words) for word in important_words]
            # tense counts go on the same row (the original appended them as a separate
            # row, which made the matrix ragged and broke np.hstack); pos_tag works
            # directly on the token list
            if use_tense:
                tagz = zip(*nltk.pos_tag(words))[1]
                past_num = len([v for v in tagz if v == 'VBD'])
                present_num = len([v for v in tagz if v in ['VBP', 'VB']])
                important_words_ftr = important_words_ftr + [past_num, present_num]
            add_features.append(important_words_ftr)

        self.feature_matrix = np.hstack((self.feature_matrix, add_features))
        return DataFrame(self.feature_matrix, columns=self.feature_names)
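# A hedged usage sketch (not from the original): train the vectorizer on the lite CSVs,
# persist it, and build a feature frame for a couple of made-up lines.
extractor = FeatureExtractor()
extractor.train_extractor(full=False)
extractor.save_vectorizer()
features_df = extractor.lines2features([
    'cloudy with a chance of rain and strong wind',
    'hot and sunny all afternoon',
])
print(features_df.shape)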
# Get predictions Bayes
unclassified_tweet_sentiments_bayes = classifier_bayes.predict(unclassified_features)

# Store the sentiment in a new column, NOTE 0 is negative, 4 is positive
unclassified_df['Sentiment'] = unclassified_tweet_sentiments_bayes
unclassified_df.head()

# Need code to classify the tweets for the different major political parties; in this case
# there are 4 major political party categories I will consider in the Canadian context:
# ***'Liberal', 'Conservative', 'NDP', 'Others'***
# As this data needs to be assigned to a party, a simple word-frequency counter algorithm
# will be used to assign tweets to each party.

# Preprocessor and tokenizer code
preprocessor = vectorizer.build_preprocessor()
tokenizer = vectorizer.build_tokenizer()


# Defining the bag_of_words function
def bag_of_words(tw):
    '''(str) -> dict
    Input: a string tw (a tweet line)
    Output: a python dictionary
    '''
    unigram_ls = tokenizer(preprocessor(tw))
    # Create an empty dictionary
    bag_words = {}
    # Run through tokenized unigram list
    for item in unigram_ls:
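        # Plausible completion (the original snippet cuts off here): count each unigram.
        if item in bag_words:
            bag_words[item] += 1
        else:
            bag_words[item] = 1
    return bag_words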
__author__ = 'Alena'
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = fetch_20newsgroups()

vect = TfidfVectorizer()
tok = vect.build_tokenizer()
texts = []
Y = vect.fit_transform(dataset.data)

first = Y.getcol(0)
second = Y.getcol(1)

word1 = []
for i, el in enumerate(first):
    word1.append(first._get_single_element(i, 0))
word2 = []
for i, el in enumerate(second):
    word2.append(second._get_single_element(i, 0))

# L1 distance: sum of absolute differences between the two TF-IDF columns
distance = 0
for i in range(len(word2)):
    distance += abs(word1[i] - word2[i])
print(distance)
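# An equivalent, vectorized way to get the same L1 distance without touching private
# sparse-matrix internals (a sketch, not part of the original snippet):
diff = Y.getcol(0) - Y.getcol(1)
print(abs(diff).sum())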