Example #1
def mutual_information_similarity(file_name):
    """
    Calculates MI between all pairs of short_genre based on their word's MI.

    Prints to file the similarity

    :return:
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    from sklearn.feature_extraction import DictVectorizer
    import collections
    import itertools
    import math
    import operator

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))  # a type

    # fetch all short genres
    mi_coll = MutualInformation()
    # genres, in the order their bag-of-words rows are added to the matrix below
    genres = []

    # calculate cosine similarity b/w pairs
    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        if genre not in genres:
            genres.append(genre)

        new_bow = {}

        for k in bow.keys():
            curr = bow[k]
            # Replace NaN/inf MI scores with 0 so DictVectorizer only sees finite values
            if math.isnan(curr) or math.isinf(curr):
                new_bow[k] = 0
                print("Eliminated element")
            else:
                new_bow[k] = curr

        return new_bow

    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow) for mi_obj in mi_coll.iterable()
    )

    print("Done with making vector")
    # sort the pairs by the cosine similarity score
    similarity_matrix = cos_sim(bow_matrix)

    print("Done with similarity calculation")
    sorted_list = []
    # sort the similarity scores
    for x, y in itertools.combinations(range(0, len(genres)), 2):
        sorted_list.append(SimilarityScore(genres[x], genres[y], similarity_matrix[x][y]))
    # sort!
    sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as file:
        for l in sorted_list:
            file.write("{}, {} value: {}\n".format(l[0], l[1], l[2]))
Example #2
def transform_into_indicators(origin_dest_arr, params):
    # Python 2 / legacy-pandas example: print statements, DataFrame.as_matrix(), apply(reduce=True)
    print "Fitting the label encoder"
    keys = origin_dest_arr.keys()
    origins = map(lambda x: {'origin': x}, origin_dest_arr['origin'].as_matrix().astype(str))
    dests = map(lambda x: {'dest': x}, origin_dest_arr['dest'].as_matrix().astype(str))
    d = DictVectorizer(sort=False)
    reduce_fn = get_reduce_fn(keys)
    print "mapping"
    # cat = origin_dest_arr.to_dict(orient='records')
    cat = map(lambda x: x.__self__, origin_dest_arr.apply(reduce_fn, axis=1, raw=True, reduce=True))
    print "Transforming"
    return d.fit_transform(cat)
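A hedged sketch of the same origin/dest one-hot encoding in Python 3 with current pandas, following the to_dict(orient='records') route the function above leaves commented out; the small DataFrame is a made-up stand-in for origin_dest_arr.

import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Made-up stand-in for origin_dest_arr
origin_dest_arr = pd.DataFrame({
    "origin": ["JFK", "SFO", "JFK"],
    "dest":   ["LAX", "ORD", "ORD"],
})

d = DictVectorizer(sort=False)
indicators = d.fit_transform(origin_dest_arr.astype(str).to_dict(orient="records"))
# get_feature_names_out() needs scikit-learn >= 1.0; older releases use get_feature_names()
print(d.get_feature_names_out())   # e.g. ['origin=JFK', 'dest=LAX', 'origin=SFO', 'dest=ORD']
print(indicators.toarray())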
Example #3
def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)

    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data
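A hypothetical call to cat_vectorize on two tiny DataFrames (the column names are invented, and cat_vectorize above is assumed to be in scope), just to show the expected inputs and the shape of the one-hot output.

import pandas as pd

train_df = pd.DataFrame({"age": [25, 32], "city": ["Oslo", "Bergen"], "plan": ["basic", None]})
test_df = pd.DataFrame({"age": [41], "city": ["Oslo"], "plan": ["premium"]})

vec_train, vec_test = cat_vectorize(train_df, test_df, num_cols=["age"])
# Categories unseen during fit (here plan="premium") are ignored by transform(), so that
# row is all zeros in the plan columns; missing values became the 'NA' category.
print(vec_train.shape, vec_test.shape)   # e.g. (2, 4) (1, 4)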
Example #4
class EntityDetectionPU(ClassifierMixin):
    def __init__(self, prior=.5, sigma=.1, lam=1, basis='gauss', n_basis=200):
        self.clf = SparsePU_SL(prior=prior,
                               sigma=sigma,
                               lam=lam,
                               basis=basis,
                               n_basis=n_basis)
        # self.clf = PU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = SVC()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        x_feat = self.featureizer.fit_transform(X)
        self.clf.fit(x_feat, y)
        return self

    def predict(self, X):
        x_feat = self.featureizer.transform(X)
        return self.clf.predict(x_feat)

    def predict_sent(self, X_sent):
        for xi in X_sent:
            yield (self.predict([xi]))
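SparsePU_SL comes from this example's own codebase, so the following sketch only reproduces the DictVectorizer-wrapping pattern, with scikit-learn's LogisticRegression standing in for the PU classifier and made-up token feature dicts.

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

class DictFeatureClassifier:
    def __init__(self):
        # LogisticRegression is a stand-in; the example above plugs in SparsePU_SL here
        self.clf = LogisticRegression()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        # X is a list of per-token feature dicts, e.g. {"word": "Paris", "is_title": 1}
        self.clf.fit(self.featureizer.fit_transform(X), y)
        return self

    def predict(self, X):
        return self.clf.predict(self.featureizer.transform(X))

X = [{"word": "Paris", "is_title": 1}, {"word": "went", "is_title": 0},
     {"word": "Alice", "is_title": 1}, {"word": "home", "is_title": 0}]
y = [1, 0, 1, 0]
print(DictFeatureClassifier().fit(X, y).predict([{"word": "Bob", "is_title": 1}]))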
Example #5
	def get_data_social(self, ids) :
		'''
		Read the social features from the database.
		'''
#		data = self.get_social_features(ids)
		
		# First get some aggregated values
		boards_info = self.get_boards_info()
		repinned_info = self.get_repinned_info()
		
		uncateg, categ_entropy =  self.get_users_categories_features()

		query = """SELECT p.id as pin_id, 
							 u.id as user_id, 
							 p.nComments as comments, 
							 p.category as category, 
							 p.description as description, 
							 p.isRepin as is_repin,
							 p.date as date,
							 u.gender as gender, 
							 u.nFollowers as followers, 
							 u.nFollowing as following, 
							 u.nPins as pins,
							 u.nBoards as boards,
							 (u.website != "null") as has_website, 
							 p.board_id as board_id
							 FROM pins p JOIN users u ON p.user_id = u.id"""

		# Make query, get results and represent as map {pin_id: data} for quick access
		c = self.db.cursor()
		c.execute(query)
		rows_map = {row[0]: row[1:] for row in c.fetchall()}
		c.close()

		# Store concepts as a dict per row (pin) 
		data = [] 
		for pin_id in ids:

			(user_id, ncomments, categ, desc, is_repin, date, gender, nfollowers, nfollowing, npins, nboards, has_web, board_id) = rows_map[pin_id]

			f = {}

			# Convert to string to emphasize that this feature is categorical
#			f["ncomments"] = ncomments
			f["category"] = categ
			f["description_len"] = len(desc)
			f["is_repin"] = is_repin
			f["gender"] = gender
#			f["user_followers"] = nfollowers
			f["user_following"] = nfollowing
			f["users_pins"] = npins
			f["users_boards"] = nboards
			f["has_website"] = has_web

			f["is_product"] = (1 if '$' in desc else 0)
			f["day_of_the_week"] = (date.strftime("%a") if (date) else "")

			if nfollowers == 0 : 
				nfollowers = 1

#			f["follow_ratio"] = float(nfollowing)/nfollowers

			board_pins, board_followers = boards_info[board_id]
			f["board_pins"] = board_pins            # Total pins of the board
#			f["board_followers"] = board_followers  # Total followers of the board

			f["category_entropy"] = categ_entropy[user_id]
			f["uncategorized"] = uncateg[user_id]
			f["repinned"] = repinned_info[user_id]

			data.append(f)
			
	# 	data = data[0:4,:]
	
		# Convert categorical features to numerical representation 
		vec = DictVectorizer()
		data = vec.fit_transform(data).toarray()
		return vec.get_feature_names(), data
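A small sketch of what DictVectorizer does with the mixed dicts built above: string values ('category', 'gender', ...) expand into one-hot "name=value" columns, while numeric values pass through unchanged. The two pin dicts are made up; get_feature_names_out() is the scikit-learn >= 1.0 spelling of the get_feature_names() call used above.

from sklearn.feature_extraction import DictVectorizer

pins = [
    {"category": "food", "gender": "f", "description_len": 42, "is_repin": 1},
    {"category": "diy",  "gender": "m", "description_len": 7,  "is_repin": 0},
]

vec = DictVectorizer()
data = vec.fit_transform(pins).toarray()
print(vec.get_feature_names_out())
# -> ['category=diy' 'category=food' 'description_len' 'gender=f' 'gender=m' 'is_repin']
print(data)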
Example #6
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    method for finding the closest cluster center of unseen data.
    """

    ADJECTIVE = 'JJ'
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [
        USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY,
        LABEL_FEATURE_KEY
    ]

    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random",
                                        n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s",
                         row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text,
                             np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(
            Counter((ele[0] for ele in set(text_blob.pos_tags)
                     if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(
            adjective_map.items() + {
                self.POLARITY_FEATURE_KEY: polarity,
                self.SUBJECTIVITY_FEATURE_KEY: subjectivity
            }.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(
            self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(
            clustering_data_matrix)

        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix,
                                           features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(
                lambda feature_name: feature_vector[feature_name]
                if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(
                feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]

        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[
                    feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
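The class above is Python 2 code (binary-mode csv reading, dict.items() concatenation, map() fed straight into coo_matrix). Below is a compact Python 3 sketch of its core idea, with made-up per-user feature dicts standing in for the tweet-derived ones.

# Vectorize per-user feature dicts (adjective counts plus sentiment scores) and
# cluster them with KMeans; missing adjectives are implicit zeros in the sparse matrix.
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans

users = [
    {"tweetCount": 12, "polarity": 0.4, "subjectivity": 0.6, "happy": 3},
    {"tweetCount": 80, "polarity": 0.1, "subjectivity": 0.9, "angry": 5},
    {"tweetCount": 15, "polarity": 0.5, "subjectivity": 0.5, "happy": 1, "nice": 2},
]

vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(users)
kmeans = KMeans(init="random", n_clusters=2, n_init=10).fit(X)
print(kmeans.predict(vectorizer.transform([{"tweetCount": 20, "polarity": 0.3}])))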
Example #7
def vectorize(data):
    transformer = DictVectorizer()
    values = flatten([make_features(x) for x in data])
    X = transformer.fit_transform([x["x"] for x in values]).toarray()
    Y = array([x["y"] for x in values])
    return (X, Y, values)
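make_features() and flatten() are not shown in this example; the self-contained sketch below uses hypothetical versions that fit the interface vectorize() expects, namely that each input item yields a list of {"x": feature_dict, "y": label} records.

from numpy import array
from sklearn.feature_extraction import DictVectorizer

def make_features(item):
    # Hypothetical feature extractor: one record per input item
    text, label = item
    return [{"x": {"len": len(text), "first": text[0]}, "y": label}]

def flatten(list_of_lists):
    return [rec for sub in list_of_lists for rec in sub]

def vectorize(data):
    transformer = DictVectorizer()
    values = flatten([make_features(x) for x in data])
    X = transformer.fit_transform([x["x"] for x in values]).toarray()
    Y = array([x["y"] for x in values])
    return (X, Y, values)

X, Y, values = vectorize([("hello", 1), ("bye", 0)])
print(X.shape, Y)   # (2, 3) [1 0]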
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    method for finding the closest cluster center of unseen data.
    """
    
    ADJECTIVE = 'JJ'
    
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    
    
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]
    
    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
        
    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix
    
    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter((ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY:polarity, self.SUBJECTIVITY_FEATURE_KEY:subjectivity}.items())
    
    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy


    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it    
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix
    
    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix
    
    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
# categorical attributes

cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)

cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)

x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()

# vectorize (encode as one hot)

vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector

x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))


#clfLR = LogisticRegression().fit(x_train, y_train.values)
#pred = clfLR.predict(x_test)
#print classification_report(y_test.values, pred, digits=4)
#print accuracy_score(y_test.values, pred)

clfTree = tree.DecisionTreeClassifier().fit(x_train, y_train)
predict = clfTree.predict(x_test)
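A self-contained sketch of the numeric + one-hot concatenation used in this snippet, with made-up frames in place of X_train/X_test/y_train and the x_num_* arrays (which come from elsewhere in the original script).

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

X_train = pd.DataFrame({"income": [30.0, 55.0], "job": ["nurse", "chef"]})
X_test = pd.DataFrame({"income": [42.0], "job": ["chef"]})
y_train = [0, 1]
numeric_cols = ["income"]

x_num_train = X_train[numeric_cols].values
x_num_test = X_test[numeric_cols].values

vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(X_train.drop(numeric_cols, axis=1).to_dict(orient="records"))
vec_x_cat_test = vectorizer.transform(X_test.drop(numeric_cols, axis=1).to_dict(orient="records"))

# Glue numeric columns and one-hot categorical columns into one dense matrix
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))
print(tree.DecisionTreeClassifier().fit(x_train, y_train).predict(x_test))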
Example #10
class Model(object):
    """
    Class for abstracting the different classification models.
    """
    
    def __init__(self, train_tweets, train_targets, vect_options, tfidf_options, extra_params):
        self.grid_params = {
#                            'vect__ngram_range': [(1,1),(1,2),(2,2)],
#                      'tfidf__use_idf': (True,False),
#                      'tfidf__smooth_idf': (True, False),
#                      'tfidf__sublinear_tf': (True, False),
                      }
        
        self.grid_params = dict(self.grid_params.items()+extra_params.items())
        self.vect_options = vect_options
        self.tfidf_options = tfidf_options
        self.feature_set = {}
        self.train_tweets = train_tweets
        self.train_targets = train_targets
        self.only_text_features = False
        
    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """
        Performs training with the given model using the given feature set
        """
        #Establish document text feature vectors
        print "Vectorizing"
#        self.tokenizer = CountVectorizer().build_tokenizer()
        
        
        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)
#        train_counts_tf = tfidf_transformer.fit_transform(train_counts)
        
        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            
            f=codecs.open("feature_set.txt", "w", "utf8")
            for d in dict_vector:
                f.write(d.__str__())
            f.close()
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            f=codecs.open("feature_set_tdidf.txt", "w", "utf8")
            for d in tfidf_dict:
                f.write(d.__str__())
            f.close()
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
#        combined_features = FeatureUnion()
        #Crossvalidation
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
#                                        ('vect', self.vect),
#                                    ('tfidf', self.tfidf_transformer),
                                    ('clf', self.classifier)
                                    ])
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit(combined_vector, self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score)
        return self.grid
        
    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset. Stores the parameters for the optimal classifier.
        """
        
        self.grid_params = {
                    'vect__ngram_range': [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4)],
              'vect__use_idf': (True,False),
              'vect__smooth_idf': (True, False),
              'vect__sublinear_tf': (True, False),
              'vect__max_df': (0.5,),
              }
        self.vect = TfidfVectorizer()

        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
                                        ('vect', self.vect),
                                    ('clf', self.classifier)]
                                       )
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit([t.text for t in self.train_tweets], self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "        
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score, file_postfix=file_postfix)
        return self.grid

    def classify(self, tweets, sentimentvalues=None):
        """
        Performs the classification process on list of tweets.
        """
        if sentimentvalues!=None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)

        return predictions

    def classify_text(self, texts):
        """
        Performs classification with only text features.
        """
        
        count_vector = self.vect.transform([t for t in texts])
        text_vector = self.tfidf_transformer.transform(count_vector)
        predictions = self.best_estimator.predict(text_vector)

        return predictions
        
    def test_and_return_results(self, test_tweets, test_targets, sentimentvalues):
        """
        Tests the classifier on a given test set, and returns the accuracy, precision, recall, and f1 score.
        """
        self.test_words_and_values = sentimentvalues
        predictions = self.classify(test_tweets)
        binary_predictions = utils.reduce_targets(predictions)
        binary_test_targets = utils.reduce_targets(test_targets)
        
        accuracy = metrics.accuracy_score(binary_test_targets, binary_predictions)
        precision = metrics.precision_score(binary_test_targets, binary_predictions)
        recall = metrics.recall_score(binary_test_targets, binary_predictions)
        f1_score = metrics.f1_score(binary_test_targets, binary_predictions)
        print "Scores:  ", accuracy, precision, recall, f1_score
        
        return accuracy, precision, recall, f1_score
    
    def get_correctly_classified_tweets(self, tweets_and_sentiment):
        """
        Classifies the given set of tweets and returns the ones that were correctly classified.
        """
        tweets, sentimentvalues = zip(*tweets_and_sentiment)
        if sentimentvalues!=None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)
        tweets, targets = utils.make_subjectivity_targets(tweets)
        #return the tweets where the target match prediction
        correct_tweets = []
        correct_sentimentvalues = []
        for i in xrange(len(tweets)):
            if predictions[i]==targets[i]:
                correct_tweets.append(tweets[i])
                correct_sentimentvalues.append(sentimentvalues[i])
        return correct_tweets, correct_sentimentvalues
    
    def set_feature_set(self, featureset, sentimentvalues):
        """
        Extracts and stores the given feature set for classification.
        """
        self.featureset = featureset
        if featureset=='SA' or featureset=='PA':
            self.only_text_features=True
            self.feature_set = {}
        else:
            words_and_values = sentimentvalues
            self.feature_set = [features.get_feature_set(t, self.featureset, v) for t,v in zip(self.train_tweets,words_and_values)]
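The core trick in train_on_feature_set()/classify() above is combining TF-IDF text features with TF-IDF-weighted DictVectorizer features via scipy.sparse.hstack. A compact sketch of just that pattern, with made-up tweets, feature dicts, labels, and LinearSVC standing in for self.classifier:

import scipy.sparse as sp
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC

texts = ["great game tonight", "terrible service, never again", "meh, it was ok"]
feature_dicts = [{"emoticons": 1, "avg_polarity": 0.9},
                 {"emoticons": 0, "avg_polarity": 0.1},
                 {"emoticons": 0, "avg_polarity": 0.5}]
targets = ["positive", "negative", "neutral"]

vect = CountVectorizer()
tfidf_text = TfidfTransformer()
tfidf_dict = TfidfTransformer()
dict_vectorizer = DictVectorizer()

# TF-IDF over raw text, TF-IDF-weighted DictVectorizer output, hstacked into one matrix
text_block = tfidf_text.fit_transform(vect.fit_transform(texts))
dict_block = tfidf_dict.fit_transform(dict_vectorizer.fit_transform(feature_dicts))
combined = sp.hstack([text_block, dict_block])

clf = LinearSVC().fit(combined, targets)
print(clf.predict(sp.hstack([
    tfidf_text.transform(vect.transform(["great service"])),
    tfidf_dict.transform(dict_vectorizer.transform([{"emoticons": 1, "avg_polarity": 0.7}])),
])))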
        
                
Example #11
def vectorize(data):
    transformer = DictVectorizer()
    values = flatten([make_features(x) for x in data])
    X = transformer.fit_transform([x["x"] for x in values]).toarray()
    Y = array([x["y"] for x in values])
    return (X, Y, values)
Example #12
    if add_location:
        id_features[id][id_location[id]] = '1'
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt',
               X_train.toarray(),
               delimiter=',',
               header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())
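A self-contained sketch of the fit-on-train / transform-on-test flow above, followed by StandardScaler on the densified matrices. The feature dicts are made-up stand-ins for the id_features rows; note the original marks locations with the string '1', which DictVectorizer one-hot encodes.

import sklearn.preprocessing as prep
from sklearn.feature_extraction import DictVectorizer

train_features = [{"location 1": "1", "feature_count": 3.0, "event_count": 7, "resource_count": 1},
                  {"location 2": "1", "feature_count": 1.0, "event_count": 2, "resource_count": 4}]
test_features = [{"location 1": "1", "feature_count": 2.0, "event_count": 5, "resource_count": 2}]

vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
X_test = vectorizer.transform(test_features)          # keys unseen during fit are dropped here

scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())
print(X_train.shape, X_test.shape)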
Example #13
        id_features[id][id_location[id]] = '1' 
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]
    
    
train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0':0, '1':1, '2':2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',', header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None,