def classify_lyrics_pos(genre_lyrics_map):
    vectorizer = DictVectorizer()

    # Collect one POS-tag count dict (and its genre label) per song
    all_lyrics_pos_tags = []
    all_lyrics_genres = []
    for genre in genre_lyrics_map.keys():
        genre_lyrics = genre_lyrics_map[genre]
        for song_lyrics in genre_lyrics:
            pos_tags_map = song_lyrics["features"]["pos_tags_map"]
            all_lyrics_pos_tags.append(pos_tags_map)
            all_lyrics_genres.append(genre)

    pos_train, pos_test, genres_train, genres_test = train_test_split(all_lyrics_pos_tags,
                                                                      all_lyrics_genres,
                                                                      test_size=0.33)

    # Fit the vectorizer on the full feature set so train and test share one vocabulary
    vectorizer.fit(all_lyrics_pos_tags)
    vect = vectorizer.transform(all_lyrics_pos_tags)
    print("vect = " + str(vect))

    classifiers_to_use = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                      pos_train, genres_train, app_data.LYRICS_GENRES_METAL)
    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                     pos_test, genres_test)
    print_top_features(partial_fit_classifiers + full_fit_classifiers, vectorizer,
                       app_data.LYRICS_GENRES_METAL)
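# Minimal, self-contained sketch (with invented data) of what the vectorizer
# above does: each song's pos_tags_map is a {tag: count} dict, and
# DictVectorizer turns a list of such dicts into a numeric matrix with one
# column per tag.
from sklearn.feature_extraction import DictVectorizer

pos_maps = [{"NN": 12, "VB": 5}, {"NN": 7, "JJ": 3}]
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(pos_maps)
print(dv.feature_names_)  # ['JJ', 'NN', 'VB']
print(X)                  # [[ 0. 12.  5.]
                          #  [ 3.  7.  0.]]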
def _get_lyrics_vectorizers():
    union = FeatureUnion(
        transformer_list=[
            ('verse_count', Pipeline([
                ('extractor', ct.LyricsVerseCountVectorizer())
            ])),
            ('stanza_count', Pipeline([
                ('extractor', ct.LyricsStanzaCountVectorizer())
            ])),
            ('avg_verse_words', Pipeline([
                ('extractor', ct.LyricsAvgVerseWordCountVectorizer())
            ])),
            ('word_count', Pipeline([
                ('extractor', ct.LyricsWordCountVectorizer())
            ])),
            ('pos_tags_map', Pipeline([
                ('extractor', ct.LyricsPartOfSpeechVectorizer()),
                ('vectorizer', DictVectorizer())
            ])),
            ('word_endings', Pipeline([
                ('extractor', ct.LyricsWordEndingsVectorizer()),
                ('vectorizer', DictVectorizer()),
                ('transformer', TfidfTransformer())
            ])),
            ('lyrics_bow', Pipeline([
                ('vectorizer', _get_tfidf_vectorizer())
            ]))
        ],
        transformer_weights={
            'verse_count':     2,
            'stanza_count':    2,
            'avg_verse_words': 3,
            'word_count':      3,
            'pos_tags_map':    10,
            'word_endings':    7,
            'lyrics_bow':      10
        }
    )
    return union
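# Hypothetical usage sketch for the union above: a FeatureUnion is itself a
# transformer, so it can sit in front of any estimator. LogisticRegression is
# an illustration, not necessarily the project's classifier, and the ct.*
# extractors are assumed to accept raw lyrics records.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('features', _get_lyrics_vectorizers()),  # weighted FeatureUnion above
    ('clf', LogisticRegression())
])
# pipeline.fit(train_lyrics, train_genres)
# predicted = pipeline.predict(test_lyrics)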
def mutual_information_similarity(file_name):
    """
    Calculates MI between all pairs of short_genre based on their words' MI.
    Prints the similarity scores to file.
    :return:
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))  # a type

    # fetch all short genres
    mi_coll = MutualInformation()

    # all genres seen so far (no repeats)
    genres = []

    # calculate cosine similarity b/w pairs
    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        if genre not in genres:
            genres.append(genre)
        # replace NaN/inf MI values with 0 so they cannot poison the vectors
        new_bow = {}
        for k in bow.keys():
            curr = bow[k]
            new_bow[k] = 0 if math.isnan(curr) or math.isinf(curr) else curr
        return new_bow

    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow)
        for mi_obj in mi_coll.iterable()
    )
    print("Done with making vector")

    # pairwise cosine similarity between the genre vectors
    similarity_matrix = cos_sim(bow_matrix)
    print("Done with similarity calculation")

    sorted_list = []
    # collect the similarity score of every genre pair (no repeats)
    for x, y in itertools.combinations(range(0, len(genres)), 2):
        sorted_list.append(SimilarityScore(genres[x], genres[y], similarity_matrix[x][y]))
    # sort by score, highest first
    sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as file:
        for l in sorted_list:
            file.write("{}, {} value: {}\n".format(l[0], l[1], l[2]))
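# Self-contained sketch (invented bag-of-words values) of the core computation
# above: vectorize per-genre {word: MI} dicts, then take pairwise cosine
# similarity of the rows.
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

bows = [{"dark": 0.9, "fire": 0.4},   # e.g. genre "metal"
        {"love": 0.8, "fire": 0.1}]   # e.g. genre "pop"
dv = DictVectorizer()
matrix = dv.fit_transform(bows)
print(cosine_similarity(matrix))  # 2x2; the off-diagonal entry is the pair's score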
def transform_into_indicators(origin_dest_arr, params):
    print "Fitting the label encoder"
    keys = origin_dest_arr.keys()
    # NOTE: origins/dests are built but never used below; kept for reference
    origins = map(lambda x: {'origin': x}, origin_dest_arr['origin'].as_matrix().astype(str))
    dests = map(lambda x: {'dest': x}, origin_dest_arr['dest'].as_matrix().astype(str))
    d = DictVectorizer(sort=False)
    reduce_fn = get_reduce_fn(keys)
    print "mapping"
    #cat = origin_dest_arr.to_dict(orient='records')
    cat = map(lambda x: x.__self__, origin_dest_arr.apply(reduce_fn, axis=1, raw=True, reduce=True))
    print "Transforming"
    return d.fit_transform(cat)
def load_vocab_vectorizer(train_set, pickle=True, extra_label="default"):
    # NOTE: the 'pickle' flag shadows the stdlib module name; kept for API compatibility
    train_dv = DictVectorizer()
    # Merge every object's attr_map into one dict so the vectorizer sees the full vocabulary
    words = [dict(itertools.chain(*(train_set_obj.attr_map.items()
                                    for train_set_obj in train_set.objects())))]
    # fit the dv first
    train_dv.fit(words)
    print("vocab length is {}".format(len(train_dv.feature_names_)))
    del words
    if pickle:
        pickle_dv(train_dv, extra_label)
    return train_dv
def classify_lyrics_mixed_features(genre_lyrics_map):
    vectorizer = DictVectorizer()

    all_lyrics_features = []
    all_lyrics_genres = []
    for genre in genre_lyrics_map.keys():
        genre_lyrics = genre_lyrics_map[genre]
        for song_lyrics in genre_lyrics:
            features = song_lyrics["features"]
            song_features_map = {}
            for feature_name, feature_value in features.items():
                # pos_tags_map is a dictionary - merge it with song_features_map dictionary
                if feature_name == "pos_tags_map":
                    song_features_map.update(feature_value)
                # All other features are numeric, add their name and value as a new key-value pair to the song_features_map
                else:
                    song_features_map[feature_name] = feature_value
            print("Features: " + str(song_features_map))
            all_lyrics_features.append(song_features_map)
            all_lyrics_genres.append(genre)

    features_train, features_test, genres_train, genres_test = train_test_split(all_lyrics_features,
                                                                                all_lyrics_genres,
                                                                                test_size=0.33)

    vectorizer.fit(all_lyrics_features)

    classifiers_to_use = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                      features_train, genres_train, app_data.LYRICS_GENRES_METAL)
    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer,
                     features_test, genres_test)
    print_top_features(partial_fit_classifiers + full_fit_classifiers, vectorizer,
                       app_data.LYRICS_GENRES_METAL)
    print_classification_report(full_fit_classifiers[0], vectorizer, features_test, genres_test)
def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes: drop the numeric columns, keep the rest
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)

    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    # one dict per row, e.g. {'column_name': 'value', ...}
    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data
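# Toy usage sketch for cat_vectorize (column names are invented):
import pandas as pd

train = pd.DataFrame({"age": [30, 40], "city": ["NY", "LA"]})
test = pd.DataFrame({"age": [25], "city": ["NY"]})
vec_train, vec_test = cat_vectorize(train, test, num_cols=["age"])
print(vec_train)  # [[0. 1.]  rows one-hot encoded over ['city=LA', 'city=NY']
                  #  [1. 0.]]
print(vec_test)   # [[0. 1.]] -- categories unseen in training become all zeros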
def _get_basic_features_transformator_union():
    union = FeatureUnion(
        transformer_list=[
            ('features_dict', Pipeline([
                ('selector', LyricsFeatureSelector(key='features')),
                ('vectorizer', DictVectorizer())
            ])),
            ('pos_tags_map', Pipeline([
                ('selector', LyricsFeatureSelector(key='pos_tags_map')),
                ('vectorizer', DictVectorizer())
            ])),
            ('word_endings', Pipeline([
                ('selector', LyricsFeatureSelector(key='word_endings')),
                ('vectorizer', DictVectorizer()),
                ('transformer', TfidfTransformer())
            ])),
            ('lyrics_bow', Pipeline([
                ('selector', LyricsFeatureSelector(key='lyrics')),
                ('vectorizer', TfidfVectorizer(stop_words='english', max_df=0.6, analyzer='word'))
            ]))
        ],
        transformer_weights={
            'features_dict': 0.2,
            'pos_tags_map':  1.0,
            'word_endings':  0.7,
            'lyrics_bow':    0.4
        }
    )
    return union
def getCounts(examples):
    lengths = [len(examples[x]) for x in ("sets", "ids")] + \
              [examples[x].shape[0] for x in ("features", "labels")]
    assert len(set(lengths)) == 1, lengths
    data = {}
    print "Counting labels"
    counts = defaultdict(list)
    for sets, labels in zip(examples["sets"], examples["labels"]):
        numLabels = numpy.count_nonzero(labels)
        #category = ",".join(sets)
        categories = getCategories(sets)
        for category in categories:
            counts[category].append(numLabels)
    data["labels"] = counts
    print "Counting features"
    # Rebuild a DictVectorizer around the stored feature names so the feature
    # matrix can be decoded back into {name: value} dicts
    dv = DictVectorizer(sparse=True)
    dv.feature_names_ = examples["feature_names"]
    decoded = dv.inverse_transform(examples["features"])
    for sets, features in zip(examples["sets"], decoded):
        categories = getCategories(sets)
        tags = [x.split(":")[0] for x in features.keys()]
        tagCounts = {x: tags.count(x) for x in set(tags)}
        for tag in tagCounts.keys():
            if tag not in data:
                data[tag] = defaultdict(list)
            counts = data[tag]
            for category in categories:
                counts[category].append(tagCounts[tag])
    # feature_indices = numpy.nonzero(features)
    # for i in feature_indices:
    #     name = feature_names[i]
    #     if ":" in name:
    #         tag = "coverage_" + name.split(":")[0]
    #         if tag not in data:
    #             data[tag] = defaultdict(list)
    #         counts = data[tag]
    #         for category in sets:
    #             counts[category] += 1
    return dict(data)
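# Round-trip sketch of the decode step used in getCounts: inverse_transform
# maps a feature matrix back to {feature_name: value} dicts via the
# vectorizer's feature_names_ (names here are invented, "tag:value" style):
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)
X = dv.fit_transform([{"POS:NN": 2, "shape:Xx": 1}, {"POS:VB": 1}])
print(dv.inverse_transform(X))
# [{'POS:NN': 2.0, 'shape:Xx': 1.0}, {'POS:VB': 1.0}]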
class EntityDetectionPU(ClassifierMixin):

    def __init__(self, prior=.5, sigma=.1, lam=1, basis='gauss', n_basis=200):
        self.clf = SparsePU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = PU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = SVC()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        x_feat = self.featureizer.fit_transform(X)
        self.clf.fit(x_feat, y)
        return self

    def predict(self, X):
        x_feat = self.featureizer.transform(X)
        return self.clf.predict(x_feat)

    def predict_sent(self, X_sent):
        # Predict token by token over a sentence, yielding one result per token
        for xi in X_sent:
            yield self.predict([xi])
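# Minimal sketch of the featurization half of EntityDetectionPU (the PU
# classifier itself is project code, so only the DictVectorizer step is
# exercised here; the token dicts are invented):
from sklearn.feature_extraction import DictVectorizer

tokens = [{"word": "Paris", "is_cap": 1}, {"word": "ran", "is_cap": 0}]
featureizer = DictVectorizer(sparse=True)
X = featureizer.fit_transform(tokens)  # sparse matrix passed to SparsePU_SL.fit
print(X.shape, featureizer.feature_names_)
# (2, 3) ['is_cap', 'word=Paris', 'word=ran']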
def train_backoff_model(self, train):
    """\
    Train a DummyClassifier back-off on the given training data.
    """
    # Pick an arbitrary attribute that is not the class attribute (the
    # back-off model does not actually use feature values)
    select_attr = [train.attribs[0].name]
    if train.attribs[0].name == self.class_attr:
        select_attr = [train.attribs[1].name]
    config = {
        'class_attr': self.class_attr,
        'select_attr': select_attr,
        'vectorizer': DictVectorizer()
    }
    model = Model(config)
    model.train_on_data(train)
    return model
def _get_advanced_features_transformator_union():
    union = FeatureUnion(
        transformer_list=[
            # ('verse_count', Pipeline([
            #     ('selector', LyricsFeatureSelector(key='verse_count')),
            #     ('transformer', TfidfVectorizer()),
            # ])),
            #
            # ('stanza_count', Pipeline([
            #     ('selector', LyricsFeatureSelector(key='stanza_count')),
            #     ('transformer', TfidfVectorizer())
            # ])),
            #
            # ('avg_verse_length', Pipeline([
            #     ('selector', LyricsFeatureSelector(key='avg_verse_length')),
            #     ('transformer', TfidfVectorizer())
            # ])),

            ('pos_tags_map', Pipeline([
                ('selector', LyricsFeatureSelector(key='pos_tags_map')),
                ('vectorizer', DictVectorizer())
                # ('transformer', TfidfTransformer())
            ])),

            ('lyrics_bow', Pipeline([
                ('selector', LyricsFeatureSelector(key='lyrics')),
                ('vectorizer', CountVectorizer(max_features=10000))
                # ('transformer', TfidfTransformer())
            ]))
        ],
        transformer_weights={
            # 'verse_count':      0.5,
            # 'stanza_count':     0.5,
            # 'avg_verse_length': 0.8,
            'pos_tags_map': 0.6,
            'lyrics_bow':   0.9
        }
    )
    return union
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster
    centers on that data. It also provides a method for finding the closest
    cluster center for unseen data.
    """

    ADJECTIVE = 'JJ'

    """ Feature keys used in clustering... """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'

    """ Features not considered for clustering... """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'

    """ Predicted label feature name. """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY,
                             LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]

    def __init__(self, tweet_file_path, no_of_clusters):
        """ The constructor reads the csv file and builds the data matrix. """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from the csv file at path "tweet_file_path", extracts
        features from the tweets and returns the list of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)  # skip the header row
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet.
          Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns: (key,value) map of all features found.
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags)
                                      if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() +
                    {self.POLARITY_FEATURE_KEY: polarity,
                     self.SUBJECTIVITY_FEATURE_KEY: subjectivity}.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features (features like user_id which
        are not relevant for building cluster centers) from the data matrix
        and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters"
        clusters on the data present in the file at "tweet_file_path". It
        returns a list of feature vectors, where each feature vector contains
        only "features_to_include", or all features if "features_to_include"
        is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature
        "label" to all feature vectors of the data matrix. The data matrix is
        modified in place. It returns a new copy of data_matrix with
        "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            # Build a single-row sparse matrix aligned with the fitted
            # vectorizer's columns; missing features default to 0
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name: feature_vector[feature_name]
                       if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """ Removes all features except features_to_include. """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of
        data_matrix. The data matrix is not modified; a new modified copy is
        returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at
        file_path and assigns the closest cluster center to each user. It
        returns a list of tuples of (user_id, predicted_label, latitude,
        longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
def vectorize(data):
    transformer = DictVectorizer()
    # one {"x": feature_dict, "y": label} record per extracted example
    values = flatten([make_features(x) for x in data])
    X = transformer.fit_transform([x["x"] for x in values]).toarray()
    Y = array([x["y"] for x in values])
    return (X, Y, values)
# This controls which features of the input ARFF file will be actually
# used in training (do not select the target class feature!)
'select_attr': [
    'Lemma',
    'LemmaSuff_1', 'LemmaSuff_2', 'LemmaSuff_3', 'LemmaSuff_4',
    'LemmaSuff_5', 'LemmaSuff_6', 'LemmaSuff_7', 'LemmaSuff_8',
    'Tag_POS', 'Tag_SubPOS', 'Tag_Gen', 'Tag_Num', 'Tag_Cas',
    'Tag_PGe', 'Tag_PNu', 'Tag_Per', 'Tag_Ten', 'Tag_Gra',
    'Tag_Neg', 'Tag_Voi', 'Tag_Var'
],

# This filters out some feature values (here 'Tag_*' values equal to '.' or '-').
# You can use an arbitrary lambda function here (or None if you don't want it).
'filter_attr': lambda key, val: False if key.startswith('Tag') and val in ['.', '-'] else True,

'vectorizer': DictVectorizer(),

# Feature filtering using ANOVA (recommended)
'feature_filter': SelectPercentile(percentile=10),

# You can use any Scikit-Learn classifier here
'classifier_class': LogisticRegression,

# Classifier parameter settings (see Scikit-Learn documentation for the list of
# parameters). If you use lists instead of single values and specify the
# unfold_pattern, all the values in the lists will be tried in parallel on a
# cluster using qsub. Do not use lists of values and the unfold_pattern setting
# if you don't have access to a cluster/qsub.
'classifier_params': {
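# Quick check of what the 'filter_attr' lambda above keeps and drops:
filter_attr = lambda key, val: False if key.startswith('Tag') and val in ['.', '-'] else True
print(filter_attr('Tag_Gen', '-'))  # False -- placeholder value on a Tag_* feature is dropped
print(filter_attr('Tag_Gen', 'F'))  # True
print(filter_attr('Lemma', '-'))    # True  -- only Tag_* features are filtered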
y_test = y_test.astype(int)

# categorical attributes
cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)
x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()

# vectorize (encode as one hot)
vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))

#clfLR = LogisticRegression().fit(x_train, y_train.values)
#pred = clfLR.predict(x_test)
#print classification_report(y_test.values, pred, digits=4)
#print accuracy_score(y_test.values, pred)

clfTree = tree.DecisionTreeClassifier().fit(x_train, y_train)
def get_data_social(self, ids):
    '''
    Read the social features from the database.
    '''
    # data = self.get_social_features(ids)

    # First get some aggregated values
    boards_info = self.get_boards_info()
    repinned_info = self.get_repinned_info()
    uncateg, categ_entropy = self.get_users_categories_features()

    query = """SELECT p.id as pin_id, u.id as user_id, p.nComments as comments,
                      p.category as category, p.description as description,
                      p.isRepin as is_repin, p.date as date, u.gender as gender,
                      u.nFollowers as followers, u.nFollowing as following,
                      u.nPins as pins, u.nBoards as boards,
                      (u.website != "null") as has_website, p.board_id as board_id
               FROM pins p JOIN users u ON p.user_id = u.id"""

    # Make query, get results and represent as map {pin_id: data} for quick access
    c = self.db.cursor()
    c.execute(query)
    rows_map = {row[0]: row[1:] for row in c.fetchall()}
    c.close()

    # Store features as a dict per row (pin)
    data = []
    for pin_id in ids:
        (user_id, ncomments, categ, desc, is_repin, date, gender, nfollowers,
         nfollowing, npins, nboards, has_web, board_id) = rows_map[pin_id]
        f = {}
        # Convert to string to emphasize that this feature is categorical
        # f["ncomments"] = ncomments
        f["category"] = categ
        f["description_len"] = len(desc)
        f["is_repin"] = is_repin
        f["gender"] = gender
        # f["user_followers"] = nfollowers
        f["user_following"] = nfollowing
        f["users_pins"] = npins
        f["users_boards"] = nboards
        f["has_website"] = has_web
        f["is_product"] = (1 if '$' in desc else 0)
        f["day_of_the_week"] = (date.strftime("%a") if date else "")
        if nfollowers == 0:
            nfollowers = 1
        # f["follow_ratio"] = float(nfollowing)/nfollowers
        board_pins, board_followers = boards_info[board_id]
        f["board_pins"] = board_pins  # Total pins of the board
        # f["board_followers"] = board_followers  # Total followers of the board
        f["category_entropy"] = categ_entropy[user_id]
        f["uncategorized"] = uncateg[user_id]
        f["repinned"] = repinned_info[user_id]
        data.append(f)

    # data = data[0:4,:]

    # Convert categorical features to numerical representation
    vec = DictVectorizer()
    data = vec.fit_transform(data).toarray()
    return vec.get_feature_names(), data
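# Minimal sketch (invented rows) of how DictVectorizer treats the mixed dicts
# built above: string values such as category and gender are one-hot encoded,
# while numeric values pass through as single columns.
from sklearn.feature_extraction import DictVectorizer

rows = [{"category": "diy", "gender": "f", "users_pins": 120},
        {"category": "food", "gender": "m", "users_pins": 40}]
vec = DictVectorizer()
m = vec.fit_transform(rows).toarray()
print(vec.feature_names_)
# ['category=diy', 'category=food', 'gender=f', 'gender=m', 'users_pins']
print(m)  # [[  1.   0.   1.   0. 120.]
          #  [  0.   1.   0.   1.  40.]]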
class Model(object):
    """ Class for abstracting the different classification models. """

    def __init__(self, train_tweets, train_targets, vect_options, tfidf_options, extra_params):
        self.grid_params = {
            # 'vect__ngram_range': [(1,1),(1,2),(2,2)],
            # 'tfidf__use_idf': (True,False),
            # 'tfidf__smooth_idf': (True, False),
            # 'tfidf__sublinear_tf': (True, False),
        }
        self.grid_params = dict(self.grid_params.items() + extra_params.items())
        self.vect_options = vect_options
        self.tfidf_options = tfidf_options
        self.feature_set = {}
        self.train_tweets = train_tweets
        self.train_targets = train_targets
        self.only_text_features = False

    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """ Performs training with the given model using the given feature set """
        # Establish document text feature vectors
        print "Vectorizing"
        # self.tokenizer = CountVectorizer().build_tokenizer()
        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)
        # train_counts_tf = tfidf_transformer.fit_transform(train_counts)
        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            f = codecs.open("feature_set.txt", "w", "utf8")
            for d in dict_vector:
                f.write(d.__str__())
            f.close()
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            f = codecs.open("feature_set_tdidf.txt", "w", "utf8")
            for d in tfidf_dict:
                f.write(d.__str__())
            f.close()
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
            # combined_features = FeatureUnion()

        # Crossvalidation
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)

        # Build a Pipeline with TfidfVectorizer and classifier
        pipeline_classifier = Pipeline([
            # ('vect', self.vect),
            # ('tfidf', self.tfidf_transformer),
            ('clf', self.classifier)
        ])

        # Perform grid search
        print "Performing grid search with classifier of instance ", str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params,
                                 cv=cross_validation, refit=True, n_jobs=-1, verbose=1)
        self.grid.fit(combined_vector, self.train_targets)
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_

        print "Results for ", self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score)
        return self.grid

    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset.
        Stores the parameters for the optimal classifier.
        """
        self.grid_params = {
            'vect__ngram_range': [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4)],
            'vect__use_idf': (True, False),
            'vect__smooth_idf': (True, False),
            'vect__sublinear_tf': (True, False),
            'vect__max_df': (0.5,),
        }
        self.vect = TfidfVectorizer()
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        # Build a Pipeline with TfidfVectorizer and classifier
        pipeline_classifier = Pipeline([
            ('vect', self.vect),
            ('clf', self.classifier)
        ])
        # Perform grid search
        print "Performing grid search with classifier of instance ", str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params,
                                 cv=cross_validation, refit=True, n_jobs=-1, verbose=1)
        self.grid.fit([t.text for t in self.train_tweets], self.train_targets)
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_

        print "Results for ", self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters,
                          self.best_score, file_postfix=file_postfix)
        return self.grid

    def classify(self, tweets, sentimentvalues=None):
        """ Performs the classification process on a list of tweets. """
        if sentimentvalues != None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform(
                [features.get_feature_set(t, self.featureset, v)
                 for t, v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
        predictions = self.best_estimator.predict(combined_vector)
        return predictions

    def classify_text(self, texts):
        """ Performs classification with only text features. """
        count_vector = self.vect.transform([t for t in texts])
        text_vector = self.tfidf_transformer.transform(count_vector)
        predictions = self.best_estimator.predict(text_vector)
        return predictions

    def test_and_return_results(self, test_tweets, test_targets, sentimentvalues):
        """
        Tests the classifier on a given test set, and returns the accuracy,
        precision, recall, and f1 score.
        """
        self.test_words_and_values = sentimentvalues
        predictions = self.classify(test_tweets)
        binary_predictions = utils.reduce_targets(predictions)
        binary_test_targets = utils.reduce_targets(test_targets)
        accuracy = metrics.accuracy_score(binary_test_targets, binary_predictions)
        precision = metrics.precision_score(binary_test_targets, binary_predictions)
        recall = metrics.recall_score(binary_test_targets, binary_predictions)
        f1_score = metrics.f1_score(binary_test_targets, binary_predictions)
        print "Scores: ", accuracy, precision, recall, f1_score
        return accuracy, precision, recall, f1_score

    def get_correctly_classified_tweets(self, tweets_and_sentiment):
        """
        Classifies the given set of tweets and returns the ones that were
        correctly classified.
        """
        tweets, sentimentvalues = zip(*tweets_and_sentiment)
        if sentimentvalues != None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform(
                [features.get_feature_set(t, self.featureset, v)
                 for t, v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
        predictions = self.best_estimator.predict(combined_vector)
        tweets, targets = utils.make_subjectivity_targets(tweets)
        # return the tweets where the target matches the prediction
        correct_tweets = []
        correct_sentimentvalues = []
        for i in xrange(len(tweets)):
            if predictions[i] == targets[i]:
                correct_tweets.append(tweets[i])
                correct_sentimentvalues.append(sentimentvalues[i])
        return correct_tweets, correct_sentimentvalues

    def set_feature_set(self, featureset, sentimentvalues):
        """ Extracts and stores the given feature set for classification. """
        self.featureset = featureset
        if featureset == 'SA' or featureset == 'PA':
            self.only_text_features = True
            self.feature_set = {}
        else:
            words_and_values = sentimentvalues
            self.feature_set = [features.get_feature_set(t, self.featureset, v)
                                for t, v in zip(self.train_tweets, words_and_values)]
# (fragment: 'id' and 'combination_features' come from an enclosing loop over event ids)
id_features[id].update(combination_features)
if add_location:
    id_features[id][id_location[id]] = '1'
id_features[id]['feature_count'] = float(id_feature_count[id])
id_features[id]['event_count'] = id_event_count[id]
id_features[id]['resource_count'] = id_resource_count[id]

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()

save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',',
               header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False