import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def build_vocab(text, emb, emb_dim=300, max_df=.7, max_features=20000,
                stop_words='english'):
    '''
    Fit vocabulary and build an embedding matrix restricted to terms that
    have a vector in `emb`.
    :param text: list of documents for creating vocabulary
    :param emb: mapping from token to embedding vector
    :return: vectorizer and embedding matrix
    '''
    vect = CountVectorizer(
        stop_words=stop_words, max_df=max_df, max_features=max_features,
        token_pattern=r"(?u)[!\"#\$\%&\'()\*\+,-./:;<=>\?@\[\\\]\^_`{|}~\w]+")
    vect.fit(text)
    no_embedding = [k for k in vect.vocabulary_.keys() if k not in emb]
    print("No embeddings for:", len(no_embedding), "terms")
    vocab = [k for k in vect.vocabulary_.keys() if k in emb]
    # Shift indices by 2: 0 is the padding index, 1 is the unknown token.
    new_vocab = dict((k, i + 2) for i, k in enumerate(vocab))
    vect.vocabulary_ = new_vocab
    print('Vocabulary size:', len(new_vocab))
    embedding = np.zeros(shape=(len(new_vocab) + 2, emb_dim))
    for k, i in new_vocab.items():
        embedding[i] = emb[k]
    return vect, embedding
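# A minimal, hypothetical usage sketch for build_vocab above: the two-document
# corpus and the 3-dimensional `toy_emb` dict stand in for a real pretrained
# table such as GloVe (emb_dim would normally be 300).
toy_emb = {"cat": np.ones(3), "dog": np.full(3, 2.0)}
docs = ["the cat sat here", "a dog barked loudly"]
vect, embedding = build_vocab(docs, toy_emb, emb_dim=3)
# Rows 0 and 1 stay zero (padding and unknown); real vectors start at row 2.
print(embedding.shape)  # (number of terms with embeddings + 2, emb_dim) -> (4, 3)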
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import feature_selection  # project-local feature-selection module


def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = \
        train_test_split(movie_reviews.data, movie_reviews.target,
                         test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method: ' + fs_method, 'fs num: ' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    # Pin the selected terms as the vocabulary; use transform (not
    # fit_transform), which would refit and overwrite the hand-built mapping.
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy:', acc)
    return acc
import numpy as np
import torch
from sklearn.feature_extraction.text import CountVectorizer


def build_vocab(text, negation=False, max_df=.7, max_features=20000,
                vecPath='/ifs/data/razavianlab/ehr_ssp_embedding/word2CurDiag_ge5_5.tsv',
                stopWordPath='/ifs/data/razavianlab/stop_words.txt',
                use_torch=True):
    '''
    Fit vocabulary and create PubMed w2v matrix
    :param text: list of documents for creating vocabulary
    :return: embedding matrix and vectorizer
    '''
    # Load pretrained vectors. (Loading PubMed-and-PMC-w2v.bin via
    # torchwordemb.load_word2vec_bin is an alternative.)
    # The parameter is named use_torch rather than torch so it does not
    # shadow the torch module below.
    w2v_vocab, vec = load_star_space(vecPath, use_torch)
    stopWords = stopwords2(stopWordPath)
    vect = CountVectorizer(stop_words=stopWords, max_df=max_df,
                           max_features=max_features)
    vect.fit(text)
    no_embedding = [k for k in vect.vocabulary_.keys() if k not in w2v_vocab]
    print("No embeddings for:", len(no_embedding), "terms")
    vocab = dict((k, w2v_vocab[k]) for k in vect.vocabulary_.keys() if k in w2v_vocab)
    # Shift indices by 1: 0 is the padding index.
    new_vocab = dict((k, i + 1) for i, k in enumerate(vocab.keys()))
    if use_torch:
        embedding = torch.zeros(len(new_vocab) + 1, vec.shape[1])
    else:
        embedding = np.zeros(shape=(len(new_vocab) + 1, vec.shape[1]))
    for k, i in new_vocab.items():
        embedding[i] = vec[vocab[k]]
    if negation:
        # Append a negated copy of every embedding and map each token_NEG
        # index to it. .shape works for both torch tensors and numpy arrays
        # (the original .size()[0] was torch-only).
        n_emb = embedding.shape[0] - 1
        neg_emb = -1 * embedding
        if use_torch:
            embedding = torch.cat([embedding, neg_emb], 0)
        else:
            embedding = np.concatenate([embedding, neg_emb], 0)
        for k, v in list(new_vocab.items()):  # list() so we can add keys while iterating
            new_vocab[k + '_NEG'] = v + n_emb
    vect.vocabulary_ = new_vocab
    return embedding, vect
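# A hypothetical follow-on: with use_torch=True the returned matrix can seed a
# torch nn.Embedding layer, keeping index 0 as the padding slot, e.g.:
#
#   import torch.nn as nn
#   embedding, vect = build_vocab(notes)  # `notes` is an assumed document list
#   emb_layer = nn.Embedding.from_pretrained(embedding, freeze=False, padding_idx=0)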
from sklearn.feature_extraction.text import CountVectorizer


def build_vocab(text, max_df=.7, max_features=20000, stop_words='english',
                analyzer='word', ngram_range=(1, 1)):
    '''
    Fit vocabulary
    :param text: list of documents for creating vocabulary
    :return: vectorizer
    '''
    vect = CountVectorizer(stop_words=stop_words, max_df=max_df,
                           max_features=max_features, analyzer=analyzer,
                           ngram_range=ngram_range)
    vect.fit(text)
    # Shift indices by 1: 0 is the padding index.
    new_vocab = dict((k, i + 1) for i, k in enumerate(vect.vocabulary_.keys()))
    vect.vocabulary_ = new_vocab
    print('Vocabulary size:', len(new_vocab))
    return vect
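# A hypothetical sketch of consuming the vectorizer returned by build_vocab:
# tokenize with the vectorizer's own analyzer, map tokens to their vocabulary
# indices, and right-pad with 0, the reserved padding index. The corpus and
# max_len are illustrative.
vect = build_vocab(["good movie", "bad movie", "great plot"])
analyzer = vect.build_analyzer()

def to_indices(doc, max_len=8):
    ids = [vect.vocabulary_[t] for t in analyzer(doc) if t in vect.vocabulary_]
    return ids[:max_len] + [0] * max(0, max_len - len(ids))

print(to_indices("good plot"))  # e.g. [3, 5, 0, 0, 0, 0, 0, 0]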
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def feats(X_train_terms, X_train_texts, train_Y, X_test_texts,
          sel_feat_method=None, K=None):
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()  # tf-idf weighting is used here
    if sel_feat_method is not None:
        # Pin the selected terms as the vocabulary instead of learning one.
        term_dict = sel_terms(X_train_terms, train_Y, sel_feat_method, K)
        count_vect.vocabulary_ = term_dict
    else:
        # Without feature selection the vectorizer must be fitted first,
        # otherwise transform() raises NotFittedError.
        count_vect.fit(X_train_texts)
    X_train_counts = count_vect.transform(X_train_texts)
    X_train_feats = tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(X_test_texts)  # build document counts
    X_test_feats = tfidf_transformer.transform(X_test_counts)  # build document tf-idf
    return X_train_feats, X_test_feats
@classmethod
def load(
    cls,
    meta: Dict[Text, Any],
    model_dir: Text = None,
    model_metadata: Metadata = None,
    cached_component: Optional["CountVectorsFeaturizer"] = None,
    **kwargs: Any,
) -> "CountVectorsFeaturizer":
    from sklearn.feature_extraction.text import CountVectorizer

    file_name = meta.get("file")
    featurizer_file = os.path.join(model_dir, file_name)

    if os.path.exists(featurizer_file):
        vocabulary = utils.json_unpickle(featurizer_file)
        # Restore the fitted vocabulary onto a fresh vectorizer without refitting.
        vectorizer = CountVectorizer()
        vectorizer.vocabulary_ = vocabulary
        return cls(meta, vectorizer)
    else:
        return cls(meta)
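# Generic counterpart to the load() above: a sketch of how a fitted vocabulary
# can be serialized so a later load can restore it. The file name and the plain
# json round-trip are illustrative assumptions, not the component's actual
# persist() implementation.
import json
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer().fit(["hello world", "hello rasa"])
with open("featurizer_vocab.json", "w") as f:
    json.dump({t: int(i) for t, i in vec.vocabulary_.items()}, f)  # term -> column index
with open("featurizer_vocab.json") as f:
    restored = CountVectorizer()
    restored.vocabulary_ = json.load(f)  # restore without refitting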
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import utils  # project-local pickle helpers


def bow_matrix(train_text, test_text, max_features, load_path=None, save_path=None):
    # Identity preprocessor/tokenizer: the input is assumed to be pre-tokenized.
    vectorizer = CountVectorizer(max_features=max_features,
                                 preprocessor=lambda x: x,
                                 tokenizer=lambda x: x)
    if load_path:
        # Reuse a previously saved vocabulary instead of refitting.
        vectorizer.vocabulary_ = utils.load_pickle(load_path)
        features_train = vectorizer.transform(train_text).toarray()
    else:
        features_train = vectorizer.fit_transform(train_text).toarray()
    vocabulary = vectorizer.vocabulary_
    feature_names = vectorizer.get_feature_names_out()  # get_feature_names() in scikit-learn < 1.0
    features_test = vectorizer.transform(test_text).toarray()
    new_train_df = pd.DataFrame(data=features_train, columns=feature_names)
    new_test_df = pd.DataFrame(data=features_test, columns=feature_names)
    if save_path:
        utils.save_pickle(vocabulary, save_path)
    return new_train_df, new_test_df, vocabulary
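# Hypothetical round trip for bow_matrix: fit once and save the vocabulary,
# then rebuild the same feature columns later without refitting. The toy
# texts are pre-tokenized lists because of the identity tokenizer above, and
# utils.save_pickle/load_pickle are the assumed project helpers.
train = [["good", "movie"], ["bad", "movie"]]
test = [["good", "plot"]]
tr_df, te_df, vocab = bow_matrix(train, test, max_features=100, save_path="vocab.pkl")
tr_df2, te_df2, _ = bow_matrix(train, test, max_features=100, load_path="vocab.pkl")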
import json

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# importlib_path is assumed to be importlib.resources.path (or its backport).
try:
    from importlib.resources import path as importlib_path
except ImportError:
    from importlib_resources import path as importlib_path


def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    encoders = {}

    # Text
    tokenizer = CountVectorizer(max_features=10000)
    module = "webcompat_ml.models.invalid.encoders"
    filename = "model_vocab.json"
    with importlib_path(module, filename) as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            tokenizer.vocabulary_ = json.load(infile)
    encoders["tokenizer"] = tokenizer

    # Labels
    labels_encoder = LabelBinarizer()
    filename = "labels_encoder.json"
    with importlib_path(module, filename) as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            labels_encoder.classes_ = json.load(infile)
    encoders["labels_encoder"] = labels_encoder

    # Target field: invalid
    invalid_encoder = LabelEncoder()
    filename = "invalid_encoder.json"
    with importlib_path(module, filename) as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            invalid_encoder.classes_ = np.array(json.load(infile))
    encoders["invalid_encoder"] = invalid_encoder

    return encoders
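# A hypothetical end-to-end sketch using the restored encoders: vectorize a raw
# text field with the tokenizer and decode a model's integer prediction back to
# its label. The example text and the index fed to inverse_transform are
# illustrative; model.predict(X) would normally produce that index.
encoders = load_encoders()
X = encoders["tokenizer"].transform(["page renders blank in this browser"])
label = encoders["invalid_encoder"].inverse_transform(np.array([0]))[0]
print(X.shape, label)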
def predict(self, X_test):
    """Method to make predictions, vectorized implementation

    Args
        X_test: pandas dataframe or dataseries. test set.
    Return:
        y_pred: pandas dataseries. predictions of model.
    """
    ## exact match
    unique_classes = np.unique(
        self.df_association_rules['consequent_sentiment'].to_numpy()).reshape(1, -1)

    # params for array dimensions
    num_rules = self.df_association_rules.shape[0]
    num_examples = X_test.shape[0]
    num_classes = unique_classes.shape[1]

    E = np.repeat(self.df_association_rules['antecedents'].to_numpy().reshape(1, -1),
                  num_examples, axis=0)
    R = np.repeat(X_test.to_numpy().reshape(-1, 1), num_rules, axis=1)
    A = np.repeat(self.df_association_rules['consequent_sentiment'].to_numpy().reshape(-1, 1),
                  num_classes, axis=1)
    B = np.repeat(unique_classes, num_rules, axis=0)
    C = self.df_association_rules['confidence'].to_numpy().reshape(-1, 1)
    exact_match_confidence = np.matmul(E == R, np.multiply((A == B), C))

    ## partial match: restrict to single-word antecedents
    my_vocabulary = list(self.df_association_rules[
        self.df_association_rules['antecedents'].apply(
            lambda x: len(x.split())) == 1]['antecedents'])
    my_vocabulary_dict = {vocab: i for i, vocab in enumerate(my_vocabulary)}
    # Hand the vectorizer a fixed vocabulary instead of fitting one (the
    # original fit_transform call on the dict was redundant: its result was
    # immediately overwritten by this assignment).
    vectorizer = CountVectorizer(lowercase=False,
                                 token_pattern='[a-zA-Z0-9$&+,:;=?@#|<>.^*()%!-]+')
    vectorizer.vocabulary_ = my_vocabulary_dict
    tf = vectorizer.transform(X_test.apply(lambda s: s.replace(',', '')))
    tf = tf.todense()

    df_unique_ass_rules = self.df_association_rules[
        self.df_association_rules['antecedents'].apply(
            lambda x: len(x.split())) == 1]
    num_rules = df_unique_ass_rules.shape[0]
    A = np.repeat(df_unique_ass_rules['consequent_sentiment'].to_numpy().reshape(-1, 1),
                  num_classes, axis=1)
    B = np.repeat(unique_classes, num_rules, axis=0)
    C = df_unique_ass_rules['confidence'].to_numpy().reshape(-1, 1)
    conf_sums = np.matmul(tf, np.multiply((A == B), C))
    conf_counts = np.matmul(tf, np.multiply((A == B), C) != 0)
    conf_counts = ((conf_counts == 0) * 0.000001) + conf_counts  # avoid division by zero
    partial_match_confidence = np.divide(conf_sums, conf_counts)

    ## if exact match: exact_match_confidence else partial_match_confidence
    if_not_exact_match = (exact_match_confidence.sum(axis=1) == 0).reshape(-1, 1)
    confidence = exact_match_confidence + np.multiply(if_not_exact_match,
                                                      partial_match_confidence)

    ## if there are no exact or partial matches predict the most common class
    classes = np.repeat(unique_classes, num_examples, axis=0)
    if 'neutral' in list(classes[0]):
        i = list(classes[0]).index('neutral')
        confidence[:, i:i + 1] = confidence[:, i:i + 1] + (confidence.sum(axis=1) == 0) * 0.6
    elif 'positive' in list(classes[0]):
        i = list(classes[0]).index('positive')
        confidence[:, i:i + 1] = confidence[:, i:i + 1] + (confidence.sum(axis=1) == 0) * 0.3

    ## get predictions
    y_pred = classes[np.unravel_index(confidence.argmax(axis=1), classes.shape)]
    # confidence kept for predict_proba
    self.confidence = confidence
    y_pred = y_pred.reshape(-1, )
    return y_pred
import bz2
import contextlib
import json

from sklearn.feature_extraction.text import CountVectorizer

# preprocessed data loads
train_ctxt_tfidfed = load_sparse_csr(train_ctxt_tfidfed_file)
with contextlib.closing(bz2.BZ2File(train_gold_resp_preprocessed_file, 'rb')) as f:
    train_gold_resp_preprocessed = json.load(f)
with contextlib.closing(bz2.BZ2File(train_alt_resp_preprocessed_file, 'rb')) as f:
    train_alt_resp_preprocessed = json.load(f)
print('loaded data!')

# prepare vocab, count_vect
assert len(count_vect_vocab) == VOCAB_SIZE
count_vect = CountVectorizer(tokenizer=my_tokenize)
count_vect.vocabulary_ = count_vect_vocab
vocab_dict = {x: i + 1 for i, x in enumerate(count_vect_vocab)}  # +1 since 0 is for masking
vocab_dict['UNK'] = len(vocab_dict) + 1
inv_vocab = {vocab_dict[x]: x for x in vocab_dict}

# generators
# train_gen = data_generator_raw(train_x, train_y, vocab_dict, count_vect, tfidf_transformer)
train_gen = data_generator_preprocessed(train_ctxt_tfidfed,
                                        train_gold_resp_preprocessed,
                                        train_alt_resp_preprocessed, train_y)
val_gen = data_generator_raw(val_x, val_y, vocab_dict, count_vect, tfidf_transformer)
import json
import os

import joblib

inputData = inputEvent['vector']

### Model input
modelDirBase = os.getenv('DATA_DIR', '/model')
modelId = os.getenv('MODEL_ID')
modelDir = modelDirBase + "/" + modelId
model = joblib.load(modelDir + '/model.pkl')

docs_new = [inputData]
newDocsNormalized = docs_new

if isTextData:
    from sklearn.feature_extraction.text import CountVectorizer
    countVectorizer = CountVectorizer()
    countVectorizer.vocabulary_ = joblib.load(modelDir + '/vocabulary.pkl')

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer._idf_diag = joblib.load(modelDir + '/tfidf.pkl')

    X_new_counts = countVectorizer.transform(docs_new)
    newDocsNormalized = tfidf_transformer.transform(X_new_counts)

predicted = model.predict(newDocsNormalized)
print(docs_new)

response = {'response': predicted[0]}
open(baseDataDir + '/response.json', 'w').write(json.dumps(response))
print('KPIPES:SUCCESS')
                       usecols=colnames)
reviewTextDF = textData['review/text']
reviewText = reviewTextDF.tolist()

uniqueValuesDF = data['Word']
uniqueValues = uniqueValuesDF.tolist()
uniqueList = {word: i for i, word in enumerate(uniqueValues)}

# print(reviewText[0])
# text = ["Good good good taste great.***flavor ,my"]

vectorizer = CountVectorizer(binary=True)
vectorizer.vocabulary_ = uniqueList

vectorList = list()
for review in reviewText:
    review = [str(review)]
    vector = vectorizer.transform(review)
    vectorList.append(vector.toarray())

OUTPUT_FILE_NAME = "vectors.csv"
outfile = open(OUTPUT_FILE_NAME, "w")
header = ""
for item in uniqueList:
    header = header + str(item) + ", "
outfile.write(header + "\n")
result = ""
def get_bagofwords(predicted_score, genre_numbers):
    # predicted_score: the predicted mean rating for the virtual film (a number)
    # genre_numbers: the list of index/indices giving the genre(s) of the virtual film
    app = prodbox.CinemaService()
    app.loadFilms()
    app.loadGenres()
    app.loadReviews()
    app.loadReviewsContent()

    accuracy = 4.  # the precision level deciding which ratings count as close to the virtual film's rating

    # First, select the reviews relevant for building the bag of words:
    corpus = []
    for i in range(len(app.films)):
        # keep film i if it belongs to one of the requested genres...
        if max([app.genres_matrix[i, genre] == 1 for genre in genre_numbers]):
            # ... then keep this film's reviews whose rating is close to predicted_score:
            for journal in app.reviews_content[i].keys():
                journal_index = app.reviews_names.index(journal)
                if abs(predicted_score - app.reviews_matrix[i, journal_index]) < .5 / accuracy:
                    print("Rating: " + str(app.reviews_matrix[i, journal_index]) +
                          ", review: " + app.reviews_content[i][journal])
                    corpus.append(app.reviews_content[i][journal])

    # Once the review corpus is built, count the dictionary words automatically
    v = CountVectorizer()
    dic = get_dictionary()  # the adjective dictionary

    # Remove from the dictionary some positive words that could appear
    # artificially without being relevant for a poorly rated film:
    positive_adjectives = {'perfect': .6, 'great': .6, 'epic': .6, 'artful': .6}
    for adj, adj_positivity in positive_adjectives.items():
        if predicted_score < adj_positivity:
            del dic[adj]

    # Then do the counting:
    v.vocabulary_ = dic
    X = v.transform(corpus).toarray()
    y = [sum(X[:, i]) for i in range(len(v.vocabulary_))]
    # focus on the adjectives that appear most often across the whole corpus
    top_indexes = sorted(range(len(y)), key=lambda i: y[i])[-10:]
    inv_dic = {adj: key for key, adj in dic.items()}
    return {inv_dic[i]: y[i] for i in top_indexes if y[i] > 1}
news_train_df['headline'] = news_train_df['headline'].replace(np.nan, '')
news_train_df['headlineTag'] = news_train_df['headlineTag'].replace(np.nan, '')

# In[ ]:

vect = CountVectorizer()
vect.fit(list(news_train_df['headline']))

# In[ ]:

list((vect.vocabulary_).items())[0:10]

# In[ ]:

# vocabulary_ maps term -> column index, so this ranks terms by index, not by
# frequency; keep the result in its own variable rather than overwriting
# vect.vocabulary_, which would corrupt the fitted vectorizer (see the
# frequency-ranking sketch in the final cell below).
sorted_vocab = sorted(vect.vocabulary_.items(), key=lambda x: x[1], reverse=True)

# In[ ]:

sorted_vocab[0:10]

# - ### n-gram

# In[ ]:

vect1 = CountVectorizer(ngram_range=(2, 2))
vect1.fit(list(news_train_df['headline']))

# In[ ]:
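# A sketch of ranking terms by actual corpus frequency (an assumption about the
# intent of the cells above, which rank by column index instead).
# get_feature_names_out() requires scikit-learn >= 1.0; older versions use
# get_feature_names().
counts = vect.transform(list(news_train_df['headline'])).sum(axis=0).A1
freq_sorted = sorted(zip(vect.get_feature_names_out(), counts),
                     key=lambda t: t[1], reverse=True)
freq_sorted[0:10]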
import gensim
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

train['year'] = [str(pd.to_datetime(x).year) for x in train['created_time']]
# print(train.info())
# print(train.loc[:, ['hour', 'dayofweek', 'year']])

description = train['description']
summary = train['summary']
description = description.apply(
    lambda x: ' '.join(gensim.utils.simple_preprocess(x)))
summary = summary.apply(lambda x: ' '.join(gensim.utils.simple_preprocess(x)))

bow = CountVectorizer(max_features=5000, binary=True, max_df=0.5,
                      ngram_range=(1, 2))
# bow = bow.fit(description + summary)
description = bow.fit_transform(description)
bow.vocabulary_ = None  # reset so the next fit_transform learns a fresh vocabulary
summary = bow.fit_transform(summary)
# source = dummies.fit_transform(train[['source', 'num_votes']])
# this should be wrong but is issuing a smaller error

# Decimal places / object that can be unambiguously recognized at this scale:
# 0 - country or large region
# 1 - large city or district
# 2 - town or village
# 3 - neighborhood, street
# 4 - individual street, land parcel
# 5 - individual trees, door entrance
# 6 - individual humans
# 7 - practical limit of commercial surveying
# 8 - specialized surveying (e.g. tectonic plate mapping)
import matplotlib.pyplot as plt
import numpy as np

test_predictions = model.predict(X_test).flatten()
pred1 = test_predictions[0::2]
pred2 = test_predictions[1::2]

plt.scatter(y_test[:, 0], pred1)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.show()

plt.scatter(y_test[:, 1], pred2)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.show()

# plt.axis('equal')
# plt.axis('square')
# plt.xlim([0, plt.xlim()[1]])
# plt.ylim([0, plt.ylim()[1]])
# _ = plt.plot([-100, 100], [-100, 100])

print('Enter title, platform, publisher, developer, genre:')
# title = input()
title = 'God of War PS4 Sony Sony'
title = title.lower()
title = [title]

# Reuse the vocabulary learned at training time so the query is encoded
# with the same feature columns as the training data.
vectorizer.vocabulary_ = title_voc
vector = vectorizer.transform(title)
vector = vector.toarray()
test = np.asarray(vector).astype(np.float32)

test_predictions = model.predict(test).flatten()
print('Prediction is {} '.format(test_predictions))
import re

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, Y_pred)
print((382 + 27764) / 33136)  # accuracy from the confusion-matrix counts

Baby_Trend_Diaper_Champ = dataset.groupby(by=['name']).get_group('Baby Trend Diaper Champ')

corpus_1 = []
for i in range(0, 298):
    review_1 = re.sub('[^a-zA-Z]', ' ', dataset['review'][i])
    review_1 = review_1.lower()
    corpus_1.append(review_1)

from sklearn.feature_extraction.text import CountVectorizer
# Restrict counting to the hand-picked terms. The original line,
# `cv_1.vocabulary_(selected_words)`, called the vocabulary dict and would
# raise a TypeError; passing selected_words as the fixed vocabulary is the
# assumed intent.
cv_1 = CountVectorizer(vocabulary=selected_words)
X_1 = cv_1.fit_transform(corpus_1).toarray()

Sum = [sum(col) for col in zip(*Y_test)]

Y_1 = Baby_Trend_Diaper_Champ.iloc[:, 4].values
from sklearn.preprocessing import LabelEncoder
labelencoder_1 = LabelEncoder()
Y_1 = labelencoder_1.fit_transform(Y_1)

from sklearn.model_selection import train_test_split
X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(X_1, Y_1,
                                                            test_size=0.2,
                                                            random_state=0)
from sklearn.linear_model import LogisticRegression