from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split


def classify_lyrics_pos(genre_lyrics_map):
    vectorizer = DictVectorizer()

    # Collect one POS-tag frequency dict per song, plus its genre label
    all_lyrics_pos_tags = []
    all_lyrics_genres = []
    for genre, genre_lyrics in genre_lyrics_map.items():
        for song_lyrics in genre_lyrics:
            pos_tags_map = song_lyrics["features"]["pos_tags_map"]
            all_lyrics_pos_tags.append(pos_tags_map)
            all_lyrics_genres.append(genre)

    pos_train, pos_test, genres_train, genres_test = train_test_split(
        all_lyrics_pos_tags, all_lyrics_genres, test_size=0.33)

    vectorizer.fit(all_lyrics_pos_tags)
    vect = vectorizer.transform(all_lyrics_pos_tags)
    print("vect = " + str(vect))

    classifiers_to_use = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                      pos_train, genres_train, app_data.LYRICS_GENRES_METAL)
    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                     pos_test, genres_test)
    print_top_features(partial_fit_classifiers + full_fit_classifiers,
                       vectorizer, app_data.LYRICS_GENRES_METAL)
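For context, a minimal sketch of the input structure classify_lyrics_pos expects and how DictVectorizer encodes each pos_tags_map. The genre names and POS counts below are made up; get_classifiers, teach_classifiers, test_classifiers, print_top_features, and app_data come from elsewhere in the project:

# Illustrative only: toy input, not real data.
from sklearn.feature_extraction import DictVectorizer

genre_lyrics_map = {
    "black": [{"features": {"pos_tags_map": {"NN": 12, "VB": 7, "JJ": 3}}}],
    "doom":  [{"features": {"pos_tags_map": {"NN": 20, "VB": 2, "RB": 5}}}],
}

v = DictVectorizer()
X = v.fit_transform([song["features"]["pos_tags_map"]
                     for lyrics in genre_lyrics_map.values()
                     for song in lyrics])
print(v.get_feature_names())  # one column per POS tag seen in any song
print(X.toarray())            # rows align with the iteration order above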
from sklearn.feature_extraction import DictVectorizer


def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes: drop the numeric columns, keep the rest
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)
    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    # one {column: value} dict per row; note this relies on dict ordering,
    # which is only guaranteed to match row order on Python 3.7+
    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot); fit on train only, reuse on test
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data
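A minimal usage sketch for cat_vectorize (toy DataFrames with hypothetical columns), assuming pandas is available:

import pandas as pd

train = pd.DataFrame({"age": [25, 32], "city": ["Oslo", None]})
test = pd.DataFrame({"age": [41], "city": ["Bergen"]})

vec_train, vec_test = cat_vectorize(train, test, num_cols=["age"])
# Fitting produces one column per (column=value) pair seen in train,
# here "city=NA" and "city=Oslo"; the unseen value "Bergen" encodes to
# an all-zero row because the vectorizer only knows training features.
print(vec_train)
print(vec_test)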
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction import DictVectorizer


class EntityDetectionPU(ClassifierMixin):
    def __init__(self, prior=.5, sigma=.1, lam=1, basis='gauss', n_basis=200):
        # SparsePU_SL (and the alternatives below) come from an external
        # positive-unlabeled learning implementation
        self.clf = SparsePU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis,
                               n_basis=n_basis)
        # self.clf = PU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = SVC()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        x_feat = self.featureizer.fit_transform(X)
        self.clf.fit(x_feat, y)
        return self

    def predict(self, X):
        x_feat = self.featureizer.transform(X)
        return self.clf.predict(x_feat)

    def predict_sent(self, X_sent):
        for xi in X_sent:
            yield self.predict([xi])
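Assuming SparsePU_SL follows the usual fit/predict contract, the wrapper is used like any scikit-learn classifier over dict-featurized tokens. The feature names and the +1/-1 label convention below are assumptions; check the SparsePU_SL code for its actual conventions:

# Hypothetical token features; y marks positives (1) vs. unlabeled (-1).
X = [{"word": "Paris", "is_title": True},
     {"word": "runs", "is_title": False}]
y = [1, -1]

model = EntityDetectionPU(prior=0.3).fit(X, y)
print(model.predict([{"word": "London", "is_title": True}]))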
import numpy as np
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# categorical attributes
cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)
x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()

# vectorize (encode as one hot)
vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector: numeric block first, then the one-hot block
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))

# clfLR = LogisticRegression().fit(x_train, y_train.values)
# pred = clfLR.predict(x_test)
# print classification_report(y_test.values, pred, digits=4)
# print accuracy_score(y_test.values, pred)

clfTree = tree.DecisionTreeClassifier().fit(x_train, y_train)
pred = clfTree.predict(x_test)
# print classification_report(y_test.values, pred, digits=4)
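Because the numeric block is stacked in front of the one-hot block, categorical column i of x_train sits at offset len(numeric_cols) + i. A quick sanity check, as a sketch that assumes x_num_train has one column per entry in numeric_cols, in the same order:

# Map stacked-column indices back to readable names.
all_names = list(numeric_cols) + vectorizer.get_feature_names()
assert len(all_names) == x_train.shape[1]
print(all_names[:10])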
import codecs

import scipy.sparse as sp
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

import features  # project-local feature extraction module
import utils     # project-local persistence helpers


class Model(object):
    """
    Class for abstracting the different classification models.
    """

    def __init__(self, train_tweets, train_targets, vect_options,
                 tfidf_options, extra_params):
        self.grid_params = {
            # 'vect__ngram_range': [(1,1),(1,2),(2,2)],
            # 'tfidf__use_idf': (True,False),
            # 'tfidf__smooth_idf': (True, False),
            # 'tfidf__sublinear_tf': (True, False),
        }
        self.grid_params = dict(self.grid_params.items() + extra_params.items())
        self.vect_options = vect_options
        self.tfidf_options = tfidf_options
        self.feature_set = {}
        self.train_tweets = train_tweets
        self.train_targets = train_targets
        self.only_text_features = False

    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """
        Performs training with the given model using the given feature set
        """
        # Establish document text feature vectors
        print "Vectorizing"
        # self.tokenizer = CountVectorizer().build_tokenizer()
        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)
        # train_counts_tf = tfidf_transformer.fit_transform(train_counts)
        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            f = codecs.open("feature_set.txt", "w", "utf8")
            for d in dict_vector:
                f.write(d.__str__())
            f.close()
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            f = codecs.open("feature_set_tdidf.txt", "w", "utf8")
            for d in tfidf_dict:
                f.write(d.__str__())
            f.close()
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
        # combined_features = FeatureUnion()

        # Cross-validation
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        # Build a Pipeline around the classifier (vectorization already applied above)
        pipeline_classifier = Pipeline([
            # ('vect', self.vect),
            # ('tfidf', self.tfidf_transformer),
            ('clf', self.classifier)
        ])
        # Perform grid search
        print "Performing grid search with classifier of instance ", str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params,
                                 cv=cross_validation, refit=True, n_jobs=-1, verbose=1)
        self.grid.fit(combined_vector, self.train_targets)
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        print "Results for ", self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__,
                          self.best_parameters, self.best_score)
        return self.grid

    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset.
        Stores the parameters for the optimal classifier.
""" self.grid_params = { 'vect__ngram_range': [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4)], 'vect__use_idf': (True,False), 'vect__smooth_idf': (True, False), 'vect__sublinear_tf': (True, False), 'vect__max_df': (0.5,), } self.vect = TfidfVectorizer() cross_validation = StratifiedKFold(self.train_targets, n_folds=10) #Build a Pipeline with TFidfVectorizer and classifier pipeline_classifier = Pipeline([ ('vect', self.vect), ('clf', self.classifier)] ) #Perform grid search print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__) self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1) self.grid.fit([t.text for t in self.train_tweets], self.train_targets) self.best_estimator = self.grid.best_estimator_ self.best_parameters = self.grid.best_params_ self.best_score = self.grid.best_score_ print "Results for ",self.classifier.__class__.__name__ print "Best params: ", self.best_parameters print "Best score: ", self.best_score print "Storing estimator... " utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score, file_postfix=file_postfix) return self.grid def classify(self, tweets, sentimentvalues=None): """ Performs the classification process on list of tweets. """ if sentimentvalues!=None: self.test_words_and_values = sentimentvalues count_vector = self.vect.transform([t.text for t in tweets]) tfidf_count = self.tfidf_transformer.transform(count_vector) if self.only_text_features: combined_vector = tfidf_count else: dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)]) tfidf_dict = self.dict_transformer.transform(dict_vector) combined_vector = sp.hstack([tfidf_count, tfidf_dict]) predictions = self.best_estimator.predict(combined_vector) return predictions def classify_text(self, texts): """ Performs classification with only text features. """ count_vector = self.vect.transform([t for t in texts]) text_vector = self.tfidf_transformer.transform(count_vector) predictions = self.best_estimator.predict(text_vector) return predictions def test_and_return_results(self, test_tweets, test_targets, sentimentvalues): """ Tests the classifier on a given test set, and returns the accuracy, precision, recall, and f1 score. """ self.test_words_and_values = sentimentvalues predictions = self.classify(test_tweets) binary_predictions = utils.reduce_targets(predictions) binary_test_targets = utils.reduce_targets(test_targets) accuracy = metrics.accuracy_score(binary_test_targets, binary_predictions) precision = metrics.precision_score(binary_test_targets, binary_predictions) recall = metrics.recall_score(binary_test_targets, binary_predictions) f1_score = metrics.f1_score(binary_test_targets, binary_predictions) print "Scores: ", accuracy, precision, recall, f1_score return accuracy, precision, recall, f1_score def get_correctly_classified_tweets(self, tweets_and_sentiment): """ Classifies the given set of tweets and returns the ones that were correctly classified. 
""" tweets, sentimentvalues = zip(*tweets_and_sentiment) if sentimentvalues!=None: self.test_words_and_values = sentimentvalues count_vector = self.vect.transform([t.text for t in tweets]) tfidf_count = self.tfidf_transformer.transform(count_vector) if self.only_text_features: combined_vector = tfidf_count else: dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)]) tfidf_dict = self.dict_transformer.transform(dict_vector) combined_vector = sp.hstack([tfidf_count, tfidf_dict]) predictions = self.best_estimator.predict(combined_vector) tweets, targets = utils.make_subjectivity_targets(tweets) #return the tweets where the target match prediction correct_tweets = [] correct_sentimentvalues = [] for i in xrange(len(tweets)): if predictions[i]==targets[i]: correct_tweets.append(tweets[i]) correct_sentimentvalues.append(sentimentvalues[i]) return correct_tweets, correct_sentimentvalues def set_feature_set(self, featureset, sentimentvalues): """ Extracts and stores the given feature set for classification. """ self.featureset = featureset if featureset=='SA' or featureset=='PA': self.only_text_features=True self.feature_set = {} else: words_and_values = sentimentvalues self.feature_set = [features.get_feature_set(t, self.featureset, v) for t,v in zip(self.train_tweets,words_and_values)]
import numpy as np
import sklearn.preprocessing as prep
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import RFECV

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]

labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()

save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',',
               header=','.join(features))

X_test = vectorizer.transform(test_features)

# scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator = RandomForestClassifier(n_estimators=2000, criterion='entropy',
                                       max_depth=None, min_samples_split=16,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0,
                                       max_features='auto', max_leaf_nodes=None,
                                       bootstrap=False, oob_score=False,
                                       n_jobs=10, random_state=None, verbose=0,
                                       warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
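After scaling (and optional elimination), a natural next step is to fit on the full training matrix and score the unlabeled test ids, whose labels above are only placeholders. A sketch that reuses the forest settings from the RFECV estimator:

clf = RandomForestClassifier(n_estimators=2000, criterion='entropy',
                             min_samples_split=16, n_jobs=10)
clf.fit(X_train, train_labels)
probs = clf.predict_proba(X_test)  # rows align with sorted test_ids
print 'test probabilities shape:', probs.shape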