def build_text_processor(
        tokenize=True,
        lowercase=True,
        strip_accents='unicode',
        **kwargs,
):
    """Build a text-normalizing callable from CountVectorizer components.

    A ``CountVectorizer`` is configured with ``lowercase``,
    ``strip_accents`` and any extra ``kwargs`` (forwarded unchanged to its
    constructor); its preprocessor and tokenizer are then reused outside of
    any vectorization.

    Parameters
    ----------
    tokenize : bool
        When true, the preprocessed text is split by the vectorizer's
        tokenizer and the tokens are glued back together with single
        spaces. The stated purpose is to normalize whitespace (newlines,
        tabs, repeated spaces, ...) and punctuation. When false, the
        vectorizer's preprocessor is returned as is.
    lowercase, strip_accents
        Passed to the ``CountVectorizer`` constructor.
    **kwargs
        Any other ``CountVectorizer`` constructor argument.

    Returns
    -------
    callable
        A one-argument function applied to a single text.
    """
    vectorizer = CountVectorizer(
        lowercase=lowercase,
        strip_accents=strip_accents,
        **kwargs,
    )
    # Both pieces are always built, even if only the preprocessor is used.
    preprocess = vectorizer.build_preprocessor()
    split_words = vectorizer.build_tokenizer()

    if not tokenize:
        return preprocess

    def transformer(x):
        cleaned = preprocess(x)
        return ' '.join(split_words(cleaned))

    return transformer
def dump_sentences():
    """Dump the 20 newsgroups corpus to ``20news.txt`` and print statistics.

    For every document two lines are written: the name of its label, then
    the document's tokens joined by single spaces. Tokens are made ASCII-only
    by replacing every non-ASCII character with ``?``.

    The number of documents and the number of distinct space-separated
    words written are printed at the end.

    The body avoids Python 2-only syntax (``print >> f`` and joining the
    result of ``str.encode``) so it runs on Python 3 as well, while
    producing the same file content.
    """
    corpus = fetch_20newsgroups(subset='all',
                                remove=('headers', 'footers', 'quotes'))
    docs = corpus.data
    labels = corpus.target
    label_names = corpus.target_names

    # This token pattern also keeps one-character tokens.
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()

    def words(doc):
        # encode(..., 'replace') yields bytes on Python 3; decoding back to
        # text keeps the join and the file write valid on both versions.
        return ' '.join(
            token.encode('ascii', 'replace').decode('ascii')
            for token in tokenize(preprocess(doc))
        )

    doccount = 0
    vocab = set()
    with open('20news.txt', 'w') as f:
        for doc, lbl in zip(docs, labels):
            w = words(doc)
            f.write('%s\n' % label_names[lbl])
            f.write('%s\n' % w)
            doccount += 1
            vocab.update(w.split(' '))

    print('Number of documents: %d' % doccount)
    print('Number of unique words: %d' % len(vocab))
def nlp(self, model):
    """Store the text-processing callable in ``self._nlp``.

    Any value other than the string ``"default"`` is stored unchanged.
    For ``"default"``, a pipeline is assembled from a ``CountVectorizer``
    configured with ``lowercase=self.lower_case``: a document is first run
    through the vectorizer's preprocessor and then through its tokenizer.
    """
    if model != "default":
        self._nlp = model
        return

    vectorizer = CountVectorizer(lowercase=self.lower_case)
    split_words = vectorizer.build_tokenizer()
    clean = vectorizer.build_preprocessor()

    def default_pipeline(doc):
        return split_words(clean(doc))

    self._nlp = default_pipeline
def Common_Vectorizer_usage(): from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer(min_df=1) corpus = [ 'This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?', ] analyze = vectorizer.build_analyzer() print analyze("This is a text document to analyze.") print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze'] X=vectorizer.fit_transform(corpus) print vectorizer.get_feature_names() print vectorizer.vocabulary_ #.get('document') print vectorizer.transform(['Something completely new.']).toarray() print list(X) #bigram======================================================== bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1) analyze = bigram_vectorizer.build_analyzer() print analyze('Bi-grams are cool!') X_2 = bigram_vectorizer.fit_transform(corpus).toarray() print X_2 feature_index = bigram_vectorizer.vocabulary_.get('is this') print X_2[:, feature_index] #marui test print '\n\nmarui test=====================' def t_preprocessor(s): return ','.join([x.lower() for x in s.split(' ')]) stop_words1=['is','a','this'] #is ok: frozenset(['a', 'this', 'is']) stop_words2={'is':0,'a':1,'this':2} #is ok: convert to frozenset(['a', 'this', 'is']) cv = CountVectorizer(preprocessor=t_preprocessor,stop_words=stop_words2) params=cv.get_params() print 'get_params()',type(params),'---------------' for k in params: print k,'\t',params[k] print 'get_params end--------------' print '\nget_stop_words=',cv.get_stop_words() cv.fit(corpus) print cv.get_feature_names() print cv.transform(corpus).toarray() print '\n测试preprocesser, result:\t',cv.build_preprocessor()('this is a document') print '\n测试tokenizer,result',cv.build_tokenizer()('this is a document') print '\n测试tokenizer2,result',cv.build_tokenizer()('th-is is a document') print '\n测试tokenizer2,result',cv.build_tokenizer()('th_is is a document') 
print '\n测试tokenizer2,result',cv.build_tokenizer()('th&is is a document') """
def __init__(self, mask_dates, max_length=MAX_LENGTH):
    """Set up the preprocessing and tokenizing callables.

    :param mask_dates: when truthy, ``self.preprocess`` lowercases
        ``str(x)`` and replaces every match of ``self.dat`` with
        ``<DATE>``; otherwise sklearn's default preprocessor is used.
    :param max_length: stored as ``self.max_length``.
    """
    # Borrow the default preprocessor and tokenizer from sklearn.
    vectorizer = CountVectorizer()

    self.max_length = max_length
    # One or two digits, three lowercase letters, two to four digits,
    # each part optionally separated by a hyphen.
    self.dat = re.compile(r'\b\d{1,2}\-?[a-z]{3}\-?\d{2,4}\b')
    self.is_num = re.compile(r'\b\d+\b')  # isolated numbers
    self.tokenize = vectorizer.build_tokenizer()

    if not mask_dates:
        self.preprocess = vectorizer.build_preprocessor()
        return

    def mask_and_lower(x):
        # Lowercase first so the lowercase-only pattern can match.
        return self.dat.sub('<DATE>', str(x).lower())

    self.preprocess = mask_and_lower
def create_feature_matrix_token_counts(self):
    '''
    Create a n by m matrix of n twitter messages with m features representing
    count of preprocessed, stemmed, tokenized words.

    Reads ``self.twitter_messages`` plus the option attributes
    ``filter_numbers``, ``uni_bi_gram``, ``min_df``,
    ``filter_url_hashtag_username``, ``select_k_best`` and ``k``; stores the
    result in ``self.feature_matrix_token_counts`` and also sets
    ``self.token_feature_names`` and ``self.amount_of_token_features``.

    :return: n by m feature matrix of n twitter messages and m features
        (i.e. word tokens)
    '''
    # Create the basic count vectorizer so that we can copy its preprocessor
    # and tokenizer.
    # NOTE(review): the preprocessor/tokenizer do not remove stop words, and
    # sklearn documents `stop_words` as applying only when analyzer == 'word',
    # so with the callable / "char_wb" analyzers below `stop_words='english'`
    # has no effect. Left untouched to keep the feature space unchanged.
    basic_vectorizer = CountVectorizer(stop_words='english')
    preprocessor = basic_vectorizer.build_preprocessor()
    tokenizer = basic_vectorizer.build_tokenizer()

    # Create a stemmer for additional processing after preprocessing and
    # tokenizing.
    stemmer = EnglishStemmer()

    # Fetch the number pattern once instead of once per token.
    number_pattern = (vec_tools.number_pattern()
                      if self.filter_numbers else None)

    # Custom analyzer for CountVectorizer which stems tokens after
    # preprocessing, optionally dropping stems that match the number pattern.
    def stemming_analyzer(document):
        stems = [stemmer.stem(token)
                 for token in tokenizer(preprocessor(document))]
        if number_pattern is None:
            return stems
        return [stem for stem in stems if not number_pattern.search(stem)]

    if self.uni_bi_gram:
        vectorizer = CountVectorizer(stop_words='english', min_df=2,
                                     analyzer="char_wb", ngram_range=(1, 2))
    else:
        vectorizer = CountVectorizer(stop_words='english',
                                     min_df=self.min_df,
                                     analyzer=stemming_analyzer)

    all_twitter_msg_text = [t.msg_text for t in self.twitter_messages]
    all_twitter_msg_polarity = [t.polarity for t in self.twitter_messages]

    if self.filter_url_hashtag_username:
        # The return value is not used here.
        # NOTE(review): presumably filters the list in place - confirm.
        vec_tools.filter_url_username_hashtag(all_twitter_msg_text)

    self.feature_matrix_token_counts = vectorizer.fit_transform(
        all_twitter_msg_text)

    if self.select_k_best:
        # `k` is passed by keyword: it is keyword-only in current
        # scikit-learn, and the keyword form also works on older releases.
        self.feature_matrix_token_counts = SelectKBest(
            chi2, k=self.k).fit_transform(self.feature_matrix_token_counts,
                                          all_twitter_msg_polarity)
        # After selection the original names are dropped; features are
        # identified by their column index instead.
        self.amount_of_token_features = \
            self.feature_matrix_token_counts.shape[1]
        self.token_feature_names = list(
            range(self.amount_of_token_features))
    else:
        self.token_feature_names = vectorizer.get_feature_names()
        self.amount_of_token_features = len(self.token_feature_names)

    return self.feature_matrix_token_counts
def run():
    '''
    Create a product dictionary based on all tokens in the best buy product
    corpus.

    The corpus file is parsed as HTML, its text is tokenized, and every token
    is normalized with the vectorizer's preprocessor (ASCII accent stripping
    plus the vectorizer's default settings). The distinct tokens are written,
    one per line, to ``constants.PERSONAL_WORD_DICTIONARY_FILE`` as UTF-8.
    '''
    # Close the corpus file deterministically instead of leaking the handle;
    # BeautifulSoup consumes the file object while it is being constructed.
    with open(constants.BESTBUY_PRODUCT_CORPUS_FILE, 'rb') as corpus_file:
        soup = BeautifulSoup(corpus_file, 'html.parser')

    vectorizer = CountVectorizer(strip_accents='ascii')
    tokenizer = vectorizer.build_tokenizer()
    preprocessor = vectorizer.build_preprocessor()

    # Tokenize first, then normalize each token; the set removes duplicates.
    tokens = {preprocessor(item) for item in tokenizer(soup.get_text())}

    with codecs.open(constants.PERSONAL_WORD_DICTIONARY_FILE, mode='wb',
                     encoding='utf-8') as f:
        # Sorted so that repeated runs produce an identical file; the set of
        # words written is unchanged.
        for token in sorted(tokens):
            f.write(token + '\n')
def dump_reviews():
    """Write all reviews to ``reviews.txt`` and print corpus statistics.

    After calling ``download()``, every directory in ``DIRS`` is visited;
    its ``POSREV`` file is processed first, then its ``NEGREV`` file. Each
    review found by ``REVREGEX`` produces two output lines: the label
    (``positive`` / ``negative``) and the review's tokens joined by single
    spaces. Tokens that are not pure ASCII are dropped.

    The number of reviews and the number of distinct space-separated words
    written are printed at the end.

    The body avoids Python 2-only syntax so it also runs on Python 3.
    """
    download()
    print('making dataset')

    # This token pattern also keeps one-character tokens.
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    preprocess = vectorizer.build_preprocessor()
    tokenize = vectorizer.build_tokenizer()

    def is_ascii(token):
        # UnicodeError covers both failure modes: UnicodeDecodeError (Python 2
        # byte strings, implicitly decoded before encoding - the case the
        # original code caught) and UnicodeEncodeError (text strings).
        try:
            token.encode('ascii')
        except UnicodeError:
            return False
        return True

    def words(doc):
        return ' '.join(t for t in tokenize(preprocess(doc)) if is_ascii(t))

    doccount = 0
    vocab = set()
    with open('reviews.txt', 'w') as fout:
        for topicdir in DIRS:
            # Same order as before: positive reviews, then negative ones.
            for filename, label in ((POSREV, 'positive'),
                                    (NEGREV, 'negative')):
                with open(os.path.join(topicdir, filename), 'r') as f:
                    text = f.read()
                for doc in REVREGEX.findall(text):
                    w = words(doc)
                    fout.write(label + '\n')
                    fout.write(w + '\n')
                    doccount += 1
                    vocab.update(w.split(' '))

    print('Number of documents: %d' % doccount)
    print('Number of unique words: %d' % len(vocab))
def get_sparse_repr(docs, V, sort_data):
    """Vectorize ``docs`` into a term-count matrix and its vocabulary.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.
    V : int
        Passed as ``max_features`` to ``CountVectorizer``.
    sort_data : bool
        When true, ``counts`` and ``words`` are passed through
        ``sort_vocab`` and the result is asserted to satisfy
        ``is_column_sorted``.

    Returns
    -------
    (counts, words)
        ``counts`` is the fitted count matrix cast to ``np.uint32``;
        ``words`` holds the feature names.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(stop_words="english", max_features=V)
    default_preproc = vectorizer.build_preprocessor()

    def preproc(s):
        # Replace space-delimited numbers with the placeholder 'anumber'.
        # The previous pattern r' \d+ ' -> 'anumber ' swallowed the leading
        # space ("foo 12 bar" became "fooanumber bar") and, because it also
        # consumed the trailing space, skipped every second number in a run.
        # The lookahead leaves the trailing space in place for the next match.
        return re.sub(r' \d+(?= )', ' anumber', default_preproc(s))

    # Install the wrapped preprocessor on the already-built vectorizer.
    vectorizer.preprocessor = preproc

    counts = vectorizer.fit_transform(docs).astype(np.uint32)
    words = vectorizer.get_feature_names()
    if sort_data:
        counts, words = sort_vocab(counts, words)
        assert is_column_sorted(counts)

    print('loaded {} documents with a size {} vocabulary'.format(*counts.shape))
    print('with {} words per document on average'.format(np.mean(counts.sum(1))))
    print()

    return counts, words
class TextTransformer(object):
    """Token-level text cleaner: accent stripping plus spell correction."""

    # Kept so that the ``TextTransformer.sub`` attribute still exists.
    # A name bound in the class body is NOT visible as a bare name inside
    # methods, so ``sub_numbers`` imports ``re`` itself.
    from re import sub

    def __init__(self):
        """Build the tokenizer, preprocessor and spell checker.

        The spell checker is an ``en_US`` enchant dictionary extended with
        the personal word list at ``constants.PERSONAL_WORD_DICTIONARY_FILE``.
        """
        #from nltk.stem.lancaster import LancasterStemmer
        from sklearn.feature_extraction.text import CountVectorizer
        import enchant

        #self.stemmer = LancasterStemmer()
        self._vectorizer = CountVectorizer(strip_accents='ascii')
        self.tokenizer = self._vectorizer.build_tokenizer()
        self.preprocessor = self._vectorizer.build_preprocessor()
        self.spellchecker = enchant.DictWithPWL(
            "en_US", pwl=constants.PERSONAL_WORD_DICTIONARY_FILE)

    def transform_text(self, raw_text):
        """Return ``raw_text`` tokenized, normalized and spell-corrected.

        Each token is run through the preprocessor; a token the spell
        checker rejects is replaced by the checker's first suggestion, or
        kept unchanged when there are no suggestions. Tokens are joined
        with single spaces.
        """
        tokens = []
        for token in self.tokenizer(raw_text):
            clean_token = self.preprocessor(token)
            if not self.spellchecker.check(clean_token):
                corrections = self.spellchecker.suggest(clean_token)
                if corrections:
                    clean_token = corrections[0]
            tokens.append(clean_token)
        return ' '.join(tokens)

    def sub_numbers(self, text):
        """Replace every run of ASCII digits in ``text`` with ``' numbr '``."""
        # Local import: the class-level ``from re import sub`` cannot be
        # reached as a bare ``sub`` from here (it raised NameError).
        import re
        return re.sub("[0-9]+", " numbr ", text)
def test_vectorizer():
    """End-to-end checks for CountVectorizer, TfidfTransformer and
    TfidfVectorizer on the ALL_FOOD_DOCS corpus."""
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5], [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3], [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    assert_false(tv.fixed_vocabulary)

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    # The old code assigned a string to the `set_params` attribute, so the
    # analyzer was never changed and the ValueError came from the invalid
    # strip_accents value left over from the previous check. Restore a valid
    # strip_accents so the failure is really caused by the analyzer.
    v3.set_params(strip_accents=None, analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
def test_vectorizer():
    """End-to-end checks for CountVectorizer, TfidfTransformer and
    TfidfVectorizer on the ALL_FOOD_DOCS corpus."""
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5], [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3], [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    # The old code assigned a string to the `set_params` attribute, so the
    # analyzer was never changed and the ValueError came from the invalid
    # strip_accents value left over from the previous check. Restore a valid
    # strip_accents so the failure is really caused by the analyzer.
    v3.set_params(strip_accents=None, analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()
class sentMod:
    """Sentiment model: a small 1-D CNN over word-index sequences.

    Texts are turned into sequences of vocabulary indices (vocabulary taken
    from a binary ``CountVectorizer`` fitted on the training texts), padded
    to a common length and fed to a Keras model. The model is stored in /
    loaded from the ``sentimentModel`` directory.
    """

    def _load_data(self):
        """Return ``fill_set`` output for the paths given to ``__init__``.

        Previously only one code path forwarded the constructor's
        ``training`` / ``testing`` arguments; training and evaluation always
        called ``fill_set()`` with its own defaults, silently ignoring custom
        paths.
        NOTE(review): assumes ``fill_set``'s own defaults match the
        constructor defaults, as the existing load path already did - confirm.
        """
        return fill_set(self._training, self._testing)

    def sequence_setup(self, X_train):
        """Fit the vectorizer on ``X_train`` and return padded sequences.

        Sets ``vectorizer``, ``word2idx``, ``tokenize``, ``preprocess``,
        ``MAX_SEQ_LENGHT`` (longest training sequence) and ``N_FEATURES``
        (vocabulary size, also used as the padding value).
        """
        self.vectorizer = CountVectorizer(
            binary=True,
            stop_words=stopwords.words('english'),
            min_df=3,
            max_df=0.9,
            max_features=None)
        # Fitting is done for the vocabulary; the matrix itself is unused.
        self.vectorizer.fit_transform(X_train)

        # The network takes word ids as input, so the texts first have to be
        # transformed into series of word ids.
        feature_names = self.vectorizer.get_feature_names()
        self.word2idx = {word: idx for idx, word in enumerate(feature_names)}
        self.tokenize = self.vectorizer.build_tokenizer()
        self.preprocess = self.vectorizer.build_preprocessor()

        X_train_sequences = [
            to_sequence(self.tokenize, self.preprocess, self.word2idx, x)
            for x in X_train
        ]
        self.MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
        self.N_FEATURES = len(feature_names)
        # Pad with N_FEATURES, an index one past the last real word id.
        return pad_sequences(X_train_sequences,
                             maxlen=self.MAX_SEQ_LENGHT,
                             value=self.N_FEATURES)

    def create_model(self):
        """Train a new model, print its test accuracy and save it to disk."""
        # load training data
        X_train, X_test, y_train, y_test = self._load_data()
        # setup preprocessing tools for embeddings
        X_train_sequences = self.sequence_setup(X_train)

        # Prepare model; the embedding has one extra row for the padding id.
        self.model = Sequential()
        self.model.add(
            Embedding(self.N_FEATURES + 1,
                      64,
                      input_length=self.MAX_SEQ_LENGHT))
        self.model.add(Conv1D(64, 5, activation='relu'))
        self.model.add(MaxPooling1D(5))
        self.model.add(Flatten())
        self.model.add(
            Dense(units=500,
                  activation='relu',
                  input_dim=self.N_FEATURES))
        self.model.add(Dense(units=1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        self.model.summary()

        # The last 100 training samples are held out for validation.
        self.model.fit(X_train_sequences[:-100],
                       y_train[:-100],
                       epochs=3,
                       batch_size=512,
                       verbose=1,
                       validation_data=(X_train_sequences[-100:],
                                        y_train[-100:]))
        # Test the accuracy
        print("Accuracy:", self.get_accuracy())
        # Save the model to the disk
        self.model.save('sentimentModel')
        print('Sentiment Model Saved to Disk!')

    def __init__(self,
                 training="data/mc_training.csv",
                 testing="data/mc_testing.csv"):
        """Load the saved model if ``sentimentModel/`` exists, else train one.

        ``training`` and ``testing`` are the data paths handed to
        ``fill_set``.
        """
        self._training = training
        self._testing = testing
        if not os.path.exists("sentimentModel/"):
            self.create_model()
        else:
            # The vectorizer state is not saved with the model, so it is
            # rebuilt from the training texts.
            X_train = self._load_data()[0]
            self.sequence_setup(X_train)
            self.model = load_model("sentimentModel/")
            self.get_accuracy()

    def format_predict(self, data):
        """Convert an iterable of texts into padded index sequences."""
        sequences = [
            to_sequence(self.tokenize, self.preprocess, self.word2idx, x)
            for x in data
        ]
        return pad_sequences(sequences,
                             maxlen=self.MAX_SEQ_LENGHT,
                             value=self.N_FEATURES)

    def get_accuracy(self):
        """Evaluate on the test split; store and return the accuracy."""
        _, X_test, _, y_test = self._load_data()
        X_test_sequences = self.format_predict(X_test)
        scores = self.model.evaluate(X_test_sequences, y_test, verbose=1)
        self.accuracy = scores[1]
        return scores[1]

    def get_results(self):
        """Return the first output value for every test sample, as a list."""
        _, X_test, _, _ = self._load_data()
        predictions = self.model.predict(self.format_predict(X_test))
        return [pred[0] for pred in predictions]

    def predict(self, tests, pretty=False):
        """Predict scores for ``tests``.

        Returns the raw prediction array unless ``pretty`` is true, in which
        case each ``text: score`` pair is printed and nothing is returned.
        """
        predictions = self.model.predict(self.format_predict(tests))
        if not pretty:
            return predictions
        for text, pred in zip(tests, predictions):
            print(text + ": " + str(pred[0]))
# Train on everything but the last 100 samples, which serve as validation.
model.fit(
    X_train_onehot[:-100],
    y_train[:-100],
    epochs=2,
    batch_size=128,
    verbose=1,
    validation_data=(X_train_onehot[-100:], y_train[-100:]),
)

scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
print("Accuracy:", scores[1])  # Accuracy: 0.875

# Map every vocabulary word to its position in the feature-name list.
word2idx = dict(
    (word, idx) for idx, word in enumerate(vectorizer.get_feature_names())
)
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()


def to_sequence(tokenizer, preprocessor, index, text):
    """Return the ``index`` values of the words of ``text``, in order.

    ``text`` is preprocessed, then tokenized; words missing from ``index``
    are skipped.
    """
    indexes = []
    for word in tokenizer(preprocessor(text)):
        if word in index:
            indexes.append(index[word])
    return indexes


print(to_sequence(tokenize, preprocess, word2idx, "This is an important test!"))  # [2269, 4453]

X_train_sequences = [
    to_sequence(tokenize, preprocess, word2idx, sample) for sample in X_train
]
print(X_train_sequences[0])
clf_7 = Pipeline([ ('vect', TfidfVectorizer( stop_words=stop_words, token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b", )), ('clf', MultinomialNB(alpha=0.01)), ]) evaluate_cross_validation(clf_7, news.data, news.target, 5) ''' from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer() def my_tokenizer(s): return s.split() vectorizer = CountVectorizer(tokenizer=my_tokenizer) str = 'I am sure some bashers of Pens fans are pretty confused about the lack' print vectorizer.build_analyzer()(str) print vectorizer.build_tokenizer()(str) print vectorizer.build_preprocessor()(str) s1 = 'rạng sáng nay theo giờ hà_nội danh_hiệu cầu_thủ giá_trị mvp giải mls năm được công_bố tiền_đạo gốc việt_lee_nguyễn ứng_viên sáng_giá không kém đôi ngôi_sao đá giải ngoại_hạng robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bình_chọn dựa số phiếu clb dự mls giới truyền_thông cầu_thủ robbie_keane người số phiếu trận chung_kết mls cup robbie_keane los_angeles_galaxy giành danh_hiệu cầu_thủ giá_trị mls lee_nguyễn được đánh_giá cao bình_chọn ảnh espn lee_nguyễn xếp thứ_ba bình_chọn đạt tổng_số phiếu mùa lee_nguyễn ghi bàn năm pha kiến_tạo cuối giải thi_đấu ấn_tượng vai_trò cầm_trịch lối chơi ghi_bàn cho new_england_revolution vòng play off mls cup tiền vệ_sinh năm ghi thêm hai bàn ba pha kiến_tạo đưa revolution đoạt vô_địch mls khu_vực miền đông giành vé dự chung_kết mls cup đối_đầu đội bóng keane la galaxy tháng lee_nguyễn được hlv jurgen_klinsmann triệu_tập trở_lại tuyển mỹ nhờ phong_độ ấn_tượng mls cựu inter_milan newcastle_utd obafemi_martins đứng thứ_hai số phiếu bầu cầu_thủ clb phiếu bầu clb phiếu bầu truyền thông phiếu bầu cầu thủ tổng robbie_keane la galaxy obafemi_martins seattle_sounders lee_nguyễn new england rev bradley_wright phillips ny red_bulls tuấn' s2 = 'lee_nguyễn trải một năm thi_đấu hoàn_hảo ảnh usa today kết_quả được công_bố trang thông_tin chính_thức ban tổ_chức giải mls phần bình_luận tiền_vệ công lee_nguyễn đoạn lọt 
danh_sách bầu_chọn cuối_cùng cho danh_hiệu cầu_thủ giá_trị mls cho thấy lee_nguyễn một bước đột_phá sự_nghiệp nơi đanh ghi bàn đứng thứ_tư danh_sách vua_phá_lưới mùa vừa_qua tiền_vệ ghi_bàn cao lịch_sử mls chân chuyền đứng thứ_hai new_england năm pha kiến_tạo thành_công lee_nguyễn hoàn_toàn xứng_đáng lần đầu_tiên được lọt vào đội_hình tiêu_biểu mùa pha lập_công kiến_tạo lối chơi sáng_tạo ổn_định lee_nguyễn góp_phần quan_trọng làm_nên mùa giải thành_công rực_rỡ new_england_revolution họ nhì mls miền đông khi đăng_quang mls cup khu_vực đồng_nghĩa một suất vào chung_kết mls cup toàn_quốc nhờ lọt vào danh_sách rút_gọn cuối_cùng cho đua cầu_thủ giá_trị mvp robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bàn thắng gỡ hòa 1-1 vào lưới houston_dynamo tuần ngôi_sao sinh năm lọt danh_sách bốn bàn thắng đẹp mls sau bảy năm được gọi trở_lại đội_tuyển mỹ đội_hình tiêu_biểu mùa vừa_qua los_angles_galaxy đóng_góp nhiều ba cá_nhân chia đều hàng thủ đến hàng công đội bóng đối_thủ cạnh_tranh vô_địch mls cup lee_nguyễn revolution sân stubhub_center california ngày tới đội_hình tiêu_biểu mls mùa thủ_môn bill_hamid dc united hậu_vệ bobby_boswell dc united omar_gonzalez los_angeles_galaxy chad_marshall seattle_sounders tiền_vệ landon_donovan los_angeles_galaxy thierry_henry new_york_red_bulls lee_nguyễn new_england_revolution diego_valeri portland_timbers tiền_đạo robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders fc bradley_wright phillips new_york_red_bulls đông_anh' s3 = 'thành_lương đỏ làm_nên tuyệt_phẩm trận đấu cuối_cùng bảng philippines ảnh giang_huy malaysia tập_trung hôm_qua để chuẩn_bị cho trận đấu tuyển việt_nam ngày sân_nhà shah_alam sau khi lách khe cửa hẹp để giành vị_trí thứ_hai bảng tay đội singapore thầy_trò salleh háo_hức muốn được kết_quả thật tốt một lời xin_lỗi để cđv nhà thất_vọng thời_gian gì phát_biểu có_thể thấy salleh nghiên_cứu kỹ báo_cáo hlv_u2 ong_kim_swee người được liên_đoàn bóng_đá malaysia fam cử sang hà_nội theo_dõi 
đối_thủ bảng trọng_tâm tuyển việt_nam đá giao_hữu tuyển việt_nam giải đấu nên phần_nào biết làm gì để kiềm_chế sức_mạnh họ salleh tiết_lộ báo_giới malaysia chúng tô đặc_biệt cẩn_trọng số nguyễn_văn_quyết số phạm_thành_lương cầu_thủ nguy_hiểm ong_kim_swee cho biết như_thế cầu_thủ văn_quyết đỏ chưa ghi_bàn được đối_thủ đánh_giá cao lối chơi ảnh giang_huy cá_nhân ong_kim_swee đưa nhận_xét tuyển việt_nam sau một thời_gian do_thám đội bóng xây_dựng được một phong_cách hoàn_toàn khác_biệt thời hlv người nhật_bản_toshiya_miura họ cầm bóng tốt không_bao_giờ chuyền bóng ngược sau luôn hướng lên phía miura sở_hữu cầu_thủ kỹ_thuật cá_nhân tốt malaysia cảnh_giác mỗi khi đối_phương bóng sát vòng cấm_địa việt_nam ghi hai bàn vào lưới philippines cú sút xa khi được hỏi điểm yếu tuyển việt_nam ong_kim_swee người giúp u23 malaysia vô_địch sea games tỏ bí_hiểm gì thấy một tập_thể gắn_kết mỗi vị_trí đều điểm yếu họ để thủng lưới ba lần điểm yếu có_thể tận_dụng khai_thác hlv salleh đen âm_thầm chuẩn_bị kế_hoạch gây bất_ngờ tuyển việt_nam sân_nhà ảnh ts bên_cạnh việc tìm cách phong_tỏa hai ngòi_nổ tuyển việt_nam salleh cố_gắng giải_quyết khoảng_trống shukor_adan mohd_amri_yahya để hai cầu_thủ trụ_cột đều vắng_mặt trận lượt_đi án treo_giò indra_putra_mahyuddin kunanlan manaf_mamat đều có_thể được tung vào sân_sau khi minh_chứng được khả_năng buổi tập safiq_rahim mohd_muslim có_thể đá vị_trí tiền_vệ trụ thay_thế cho shukor_adan salleh tiết_lộ ít_nhiều khung đội_hình thi_đấu cuối tuần người thay_thế amri_yahya trận đấu kulanan hoặc manaf_mamat tuấn' corpus = [s1, s2, s3] print 'DOne'
#print "testdata"
#print len(test_data)
#test_data = df1.iloc[:,1]
vctr = CountVectorizer(stop_words='english', min_df=1)
vctr2 = HashingVectorizer(stop_words='english')
vctr1 = TfidfVectorizer(stop_words='english')
count_pos = 0
count_neg = 0
######################################################################################################
# Build the preprocessor and tokenizer once; they were previously rebuilt
# for every single row of both data sets.
_preprocess = vctr.build_preprocessor()
_tokenize = vctr.build_tokenizer()


def _normalize(text):
    """Return ``text`` preprocessed, tokenized and re-joined with spaces.

    ``vctr`` uses the default ``lowercase=True``, so its preprocessor already
    lowercases; the extra ``.lower()`` calls of the old loops were redundant.
    """
    return ' '.join(_tokenize(_preprocess(text)))


# The text is read from the first column of each row.
train = [_normalize(train_data[i, 0]) for i in range(len(train_data))]
test = [_normalize(test_data[i, 0]) for i in range(len(test_data))]
#print "len of the normalized test data obtained"
#print len(test)
######################################################################################################
# NOTE: train_data is rebound from the raw rows to the dense count matrix.
train_data = vctr.fit_transform(train).toarray()
#print vctr1.inverse_transform(train_data)
y_train = np.asarray(label_train, dtype="|S6")
clf1 = GradientBoostingClassifier(n_estimators=660)
class IngredientExtractor(object):
    """Estimator that identifies the most 'ingredient like' block from a list
    """

    def __init__(self):
        """Constructor method of ingredient extractor"""
        pass

    def fit(self, X, y=None):
        """Fitter method of ingredient extractor

        X is an iterable of ingredient lists in the form of strings
        y is just here for compatibility in sklearn pipeline usage

        Learns the vocabulary of X and stores the count matrix
        (``vectorized_texts_``), the vocabulary (``vocabulary_``) and the
        per-word mean count over X (``mean_corpus_``). Returns ``self``.
        """
        self._count_vect = CountVectorizer()
        self.vectorized_texts_ = self._count_vect.fit_transform(X)
        self.vocabulary_ = self._count_vect.vocabulary_
        self.mean_corpus_ = self.vectorized_texts_.mean(axis=0)
        return self

    def predict(self, X):
        """Predicter method of ingredient extractor

        X is a list of text blocks. This methods returns the index of the
        text block that is most likely to hold the ingredient list.

        Each block is scored by the number of its word occurrences that
        belong to the fitted vocabulary, divided by the norm of the block's
        own count vector (0 where that norm is 0). The scores are kept in
        ``similarity_``.
        """
        X_against_ingred_voc = self._count_vect.transform(X)
        # Norms are computed on a vectorizer fitted on X itself, i.e. over
        # all words of the blocks, not only the ingredient vocabulary.
        X_norms = sparse_norm(CountVectorizer().fit_transform(X), axis=1)
        X_dot_ingred = np.array(X_against_ingred_voc.sum(axis=1)).squeeze()

        pseudo_cosine_sim = np.divide(X_dot_ingred,
                                      X_norms,
                                      out=np.zeros(X_norms.shape),
                                      where=X_norms != 0)
        self.similarity_ = pseudo_cosine_sim
        return np.argmax(pseudo_cosine_sim)

    def show_emphasize(self, X):
        """Method that prints strings with words from vocabulary emphasized
        """
        for text in self.emphasize_texts(X):
            print(text)

    def emphasize_texts(self, X):
        """Method that returns strings with words from vocabulary emphasized

        This method shows how some candidates texts are projected on the
        vocabulary that has been provided or gotten from fitting.
        It is useful to see how different blocks compare.

        X argument is an iterable of block candidates.
        """
        check_is_fitted(self)
        # Build the helpers once and share them across all blocks.
        preprocessor = self._count_vect.build_preprocessor()
        tokenizer = self._count_vect.build_tokenizer()
        vocabulary = self._count_vect.vocabulary_
        return [
            self.emphasize_words(
                block,
                preprocessor=preprocessor,
                tokenizer=tokenizer,
                vocabulary=vocabulary,
            )
            for block in X
        ]

    def emphasize_words(
            self,
            text,
            preprocessor=None,
            tokenizer=None,
            vocabulary=None,
            ansi_color='\033[92m',  # green by default
    ):
        """Method that returns a string with words emphasized

        This methods takes a string and returns a similar string with the
        words of the vocabulary emphasized (wrapped in ``ansi_color`` and the
        ANSI reset sequence). ``preprocessor``, ``tokenizer`` and
        ``vocabulary`` default to those of the fitted vectorizer.

        Token positions are located in the preprocessed text and applied to
        the original ``text``, which assumes that preprocessing keeps
        character positions (true for plain lowercasing of most text).
        """
        check_is_fitted(self)
        ansi_end_block = '\033[0m'
        if not preprocessor:
            preprocessor = self._count_vect.build_preprocessor()
        if not tokenizer:
            tokenizer = self._count_vect.build_tokenizer()
        if not vocabulary:
            vocabulary = self._count_vect.vocabulary_

        preprocessed_text = preprocessor(text)
        idx = 0
        pieces = []
        for token in tokenizer(preprocessed_text):
            # Every token advances the cursor, including those outside the
            # vocabulary. Previously such tokens were skipped without moving
            # the cursor, so a vocabulary word could be highlighted inside an
            # earlier word (e.g. "is" inside "this").
            start = preprocessed_text.find(token, idx)
            if start == -1:
                # Token cannot be located (e.g. a tokenizer that rewrites its
                # tokens); skip it instead of running past the end of text.
                continue
            end = start + len(token)
            pieces.append(text[idx:start])
            if token in vocabulary:
                pieces.append(ansi_color + text[start:end] + ansi_end_block)
            else:
                pieces.append(text[start:end])
            idx = end
        pieces.append(text[idx:])
        return ''.join(pieces)

    def score(self, X, y):
        """Scorer method of ingredient extractor estimator

        X is an iterable of ingredient lists in the form of string
        y is the target as the index of the correct block.

        Not implemented yet: currently returns None.
        """
        pass
labels = ['No Default', 'Default'] plt.figure(figsize=(8,6)) sns.heatmap(cm,xticklabels=labels, yticklabels=labels, annot=True, fmt='d', cmap="Blues", vmin = 0.2); plt.title('Confusion Matrix') plt.ylabel('True Class') plt.xlabel('Predicted Class') plt.show() # In[126]: #Trying a Convolutional Neural Network with word sequences word2idx = {word: idx for idx, word in enumerate(vect.get_feature_names())} tokenize = vect.build_tokenizer() preprocess = vect.build_preprocessor() def to_sequence(tokenizer, preprocessor, index, text): words = tokenizer(preprocessor(text)) indexes = [index[word] for word in words if word in index] return indexes X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in df_train.TEXT] MAX_SEQ_LENGTH = len(max(X_train_sequences, key=len)) print("MAX_SEQ_LENGTH=", MAX_SEQ_LENGTH) from keras.preprocessing.sequence import pad_sequences N_FEATURES = len(vect.get_feature_names()) X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES) print(X_train_sequences[0]) from keras.models import Sequential from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding model = Sequential() model.add(Embedding(len(vect.get_feature_names()) + 1,
## Based on https://github.com/jc-healy/EmbedAllTheThings/commit/da9fd638af573e3cfdd41d7f7fdd3dfe02f1e7cd#diff-a1268b7d09e1e7b148cb6028dda26bff from collections import defaultdict import numpy as np import numba import scipy.sparse # Just steal CountVectorizer for now; fix later from sklearn.feature_extraction.text import CountVectorizer _CV_INSTANCE = CountVectorizer() _tokenizer = _CV_INSTANCE.build_tokenizer() _preprocessor = _CV_INSTANCE.build_preprocessor() # End stealing CountVectorizer # Use nltk for senticizing for now import nltk nltk.download('punkt') def nltk_sentencizer(text): return nltk.sent_tokenize(text) # End nltk stealing def regex_tokenizer(text): return _tokenizer(text)
fastTextModelDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../fastText_demo_model/')
rawTextFile = os.path.join(fastTextModelDir, 'arxiv-untagged-data-2020.txt')
preprocessedTextFile = os.path.join(fastTextModelDir, 'arxiv-2020-preprocessed.txt')

if __name__ == "__main__":
    # Construct vectorizer object whose tokens are runs of two or more
    # lowercase ASCII letters.
    cv = CountVectorizer(input='file', stop_words=stopwords, token_pattern=r"(?u)\b[a-z][a-z]+\b")
    # Function that lowercases the text. No accent stripping happens here:
    # `strip_accents` is left at its default (None).
    preprocess = cv.build_preprocessor()
    # Function that splits text into tokens using the above regex.
    # NOTE(review): the tokenizer alone does not remove stop words (sklearn
    # applies `stop_words` in its analyzer), so stop words that match the
    # pattern are still written out - confirm this is intended.
    tokenize = cv.build_tokenizer()

    # `with` closes both files; the output file was previously never closed.
    with open(rawTextFile, "r") as inp, open(preprocessedTextFile, "w") as outp:
        # One output line per input line: its tokens joined by single spaces.
        for line in inp:
            outp.write(" ".join(tokenize(preprocess(line))))
            outp.write("\n")
def process_files(self, *filenames, stop_after_rows=None, overwrite_output_files=True, output_files_prefix=''):
    '''Preprocess the post and label data from the given files.

    Every file is read with pandas and the rows are stacked. Columns are
    addressed by position: column 1 holds the numbered posts of a row,
    column 2 the string form of the list of hate-speech post numbers (or NaN
    when the row has none), and column 3 the string form of the list of
    responses.

    If stop_after_rows is given, this process stops after that many file rows
    (even if not all of the files are reached, as such).

    Side effects: reseeds `random` with RANDOM_SEED, sets
    `self.max_post_tokens`, `self.max_resp_tokens`, `self.post_tokenizer` and
    `self.resp_tokenizer`, and, when `overwrite_output_files` is true, writes
    'data/<prefix>preprocessed_data.npz' and pickles `self` to
    'data/<prefix>preprocessor.pkl'.

    Args:
        *filenames: paths of the CSV files to read; at least one is required.
        stop_after_rows: maximum number of stacked rows to process, or None
            for all rows.
        overwrite_output_files: whether to write the output files.
        output_files_prefix: prefix for the output file names.

    Returns:
        A dict with the keys 'post_word_counts', 'post_feature_names',
        'post_labels', 'post_texts', 'post_tokens', 'response_texts' and
        'resp_tokens'.
    '''
    self.max_post_tokens = 0
    self.max_resp_tokens = 0
    random.seed(RANDOM_SEED)

    data = pd.read_csv(filenames[0]).values
    for filename in filenames[1:]:
        data = np.append(data, pd.read_csv(filename).values, axis=0)

    posts = data[:stop_after_rows, 1]

    post_vectorizer = CountVectorizer()
    resp_vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')  # want to keep 1-char words in the responses when tokenizing them
    post_preprocessor = post_vectorizer.build_preprocessor()
    self.post_tokenizer = post_vectorizer.build_tokenizer()  # seq2seq also uses this
    self.resp_tokenizer = resp_vectorizer.build_tokenizer()  # seq2seq also uses this

    # Patterns are compiled once instead of being looked up for every post.
    row_split_re = re.compile(r'\n\d+\.')
    punctuation_re = re.compile(r'\.|,|;|:|\?|!|\(|\)|"|\u201C|\u201D')
    curly_apostrophe_re = re.compile(r'\u2018|\u2019')
    url_re = re.compile(r'http\S+')

    def vocabulary_of(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out(); support both. Going through a list of str
        # keeps a unicode (not object) dtype, so the .npz needs no pickling.
        getter = getattr(vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return np.array(list(getter()))

    # Plain lists are grown in the loop and converted once afterwards;
    # np.append would copy the whole array on every single post.
    all_posts = []
    labels = []
    responses = []

    n_rows = posts.shape[0]
    print("Preprocessing progress (by rows of original data):")
    for i in range(n_rows):
        if i % 100 == 0:
            print("%.0f%%" % (i*100/n_rows))

        row_posts_string = post_preprocessor(posts[i])  # preprocess the posts in this row (including making them lowercase)
        row_posts_list = row_split_re.split(row_posts_string)  # split up all the posts in a given row

        # The hate-speech indices belong to the row, so parse them once per row.
        hate_cell = data[i, 2]
        if isinstance(hate_cell, float):
            # it's 'n/a', which gets parsed as nan. So none of these posts are marked as hate
            hate_indices = frozenset()
        else:
            # it's a string representation of a list: drop the brackets and convert to ints
            stripped = hate_cell.replace('[', '').replace(']', '')
            hate_indices = frozenset(int(token) for token in stripped.split(','))
        row_resps = None  # parsed lazily, only if this row has a flagged post

        for j, post in enumerate(row_posts_list, start=1):
            # Remove a prepended "1." (the only case the regex split doesn't
            # take care of). Only the literal prefix is removed:
            # str.strip("1.") would also eat '1' and '.' characters at the end
            # of the post and repeated ones at its start.
            if post.startswith("1."):
                post = post[2:]
            post = post.strip()  # drop surrounding spaces/tabs/newlines

            post = punctuation_re.sub('', post)  # remove certain punctuation
            post = curly_apostrophe_re.sub("'", post)  # replace smart (curly) apostrophes with ASCII apostrophes, since that's what nltk uses

            # remove stopwords
            post = " ".join(word for word in post.split() if word not in STOPWORDS)

            # get rid of URLs
            post = url_re.sub('', post)

            # TODO: potential further preprocessing ideas:
            #   emojis -- not sure, might want to leave them (although we've already gotten rid of some punctuation and therefore punctuation-emojis, currently)
            #   address misspelling of significant words

            n_post_tokens = len(self.post_tokenizer(post))
            if n_post_tokens > self.max_post_tokens:
                self.max_post_tokens = n_post_tokens

            all_posts.append(post)

            if j in hate_indices:  # the jth post in this row is marked as hate speech
                labels.append(1)
                if row_resps is None:
                    row_resps = ast.literal_eval(data[i, 3])
                    row_max_resp_tokens = max(len(self.resp_tokenizer(resp)) for resp in row_resps)
                    if row_max_resp_tokens > self.max_resp_tokens:
                        self.max_resp_tokens = row_max_resp_tokens
                # Exactly one random draw per flagged post, so the seeded
                # sequence of choices is unchanged.
                responses.append(random.choice(row_resps).lower())
            else:  # the jth post in this row is marked as not hate speech
                labels.append(0)
    print("100%")

    # Return value is ignored, so this presumably processes the list in place
    # (defined elsewhere in the file).
    process_responses(responses)

    list_of_all_posts = np.array(all_posts)  # 1D numpy array of all posts
    Y = np.array(labels, dtype=float)  # one label per post: 1 = hate speech, 0 = not

    counts = post_vectorizer.fit_transform(list_of_all_posts)  # counts in a 2D matrix
    counts_np = counts.toarray()  # convert to normal numpy format
    feature_names_np = vocabulary_of(post_vectorizer)  # the features (i.e. words) that correspond to the columns of counts_np

    resp_vectorizer.fit(responses)
    resp_tokens_np = vocabulary_of(resp_vectorizer)  # all the tokens (probably words) used in the processed responses
    responses = np.array(responses)

    # remove unique features/columns (i.e. words that appear only in one post throughout the corpus)
    non_unique_indices = np.nonzero(np.count_nonzero(counts_np, axis=0) > 1)[0]  # the column indices of the features that appear in more than one document throughout the corpus
    non_unique_counts_np = counts_np[:, non_unique_indices]  # select only the columns at those indices
    non_unique_feature_names_np = feature_names_np[non_unique_indices]  # select only the feature names at those indices

    result = {
        'post_word_counts': non_unique_counts_np,
        'post_feature_names': non_unique_feature_names_np,
        'post_labels': Y,
        'post_texts': list_of_all_posts,
        'post_tokens': feature_names_np,
        'response_texts': responses,
        'resp_tokens': resp_tokens_np,
    }

    if overwrite_output_files:
        # The archive stores exactly the arrays that are returned.
        np.savez_compressed('data/' + output_files_prefix + 'preprocessed_data.npz', **result)
        with open('data/' + output_files_prefix + 'preprocessor.pkl', 'wb') as obj_file:
            pickle.dump(self, obj_file, pickle.HIGHEST_PROTOCOL)

    return result