class MetaDataSentimentAnalisis:
    """SVM polarity classifier for tweets, using text and/or tweet metadata.

    The instance owns three collaborators created in ``__init__``:

    * ``self.tokenizer``  -- a ``Tokenizer`` whose ``cleanText`` is applied
      to every tweet text before vectorization.
    * ``self.vectorizer`` -- a ``TfidfVectorizer`` fitted by the training
      methods and reused (``transform`` only) by the test/predict methods.
    * ``self.classifier`` -- an SVM estimator chosen from ``kernel_type``;
      training replaces it with a fitted ``GridSearchCV`` wrapper.

    NOTE(review): only the classifier is persisted by ``joblib.dump``; the
    fitted vectorizer is not, so a stored model appears to be usable only
    together with an instance whose vectorizer was fitted on the same
    training data -- confirm against callers.
    """

    # Candidate hyper-parameters explored by the cross-validated grid search.
    _GRID_CS = [0.001, 0.01, 0.1, 1, 10]
    _GRID_GAMMAS = [0.001, 0.01, 0.1, 1]

    def __init__(self, kernel_type):
        """Create the tokenizer, the TF-IDF vectorizer and the classifier.

        kernel_type -- 'rbf', 'linear' or 'liblinear'; any other value
                       selects a default ``svm.SVC()``.
        """
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)

    def __init_classifier(self, kernel_type):
        """Instantiate ``self.classifier`` and ``self.vectorizer``."""
        if kernel_type == 'rbf':
            self.classifier = svm.SVC(C=1, gamma=0.0000001)
        elif kernel_type == 'linear':
            self.classifier = svm.SVC(kernel='linear')
        elif kernel_type == 'liblinear':
            self.classifier = svm.LinearSVC()
        else:
            self.classifier = svm.SVC()
        self.vectorizer = TfidfVectorizer(min_df=5,
                                          max_df=0.8,
                                          sublinear_tf=True,
                                          use_idf=True)

    def _fit_with_grid_search(self, X, labels):
        """Fit ``self.classifier`` on (X, labels) via a 3-fold grid search.

        ``self.classifier`` is replaced by the fitted ``GridSearchCV``.
        """
        # A previous training run leaves a GridSearchCV in self.classifier.
        # Unwrap it so repeated training does not nest one search inside
        # another (the outer search would then try to set C/gamma on the
        # inner search object and fail).
        estimator = getattr(self.classifier, 'estimator', self.classifier)

        param_grid = {'C': list(self._GRID_CS)}
        # LinearSVC has no ``gamma`` parameter; only search over it when the
        # estimator actually exposes it.
        if 'gamma' in estimator.get_params():
            param_grid['gamma'] = list(self._GRID_GAMMAS)

        self.classifier = grid_search.GridSearchCV(estimator,
                                                   param_grid,
                                                   cv=3,
                                                   n_jobs=4,
                                                   verbose=1)
        self.classifier.fit(X, labels)

    def _store_model(self, storepath):
        """Dump ``self.classifier`` to ``storepath`` (skipped if empty)."""
        if not storepath:
            # An empty path cannot be opened; report it instead of crashing
            # after an expensive grid search.
            print("store requested but storepath is empty ... model not stored")
            return
        joblib.dump(self.classifier, storepath)

    def train_text(self,
                   train_data_path,
                   metadata_path,
                   train=True,
                   parameters=None,
                   store=False,
                   storepath=""):
        """Fit the TF-IDF vectorizer on the corpus text and train the SVM.

        train_data_path, metadata_path -- forwarded to ``TweetParser.parse``.
        train      -- when true, run the grid search and fit the classifier.
        parameters -- accepted for backward compatibility; not used.
        store      -- when true, dump the fitted classifier to ``storepath``
                      (forces ``train`` to true).
        storepath  -- destination passed to ``joblib.dump``.

        Returns the list of polarity labels, one per parsed tweet.
        """
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)

        if store and not train:
            print("store needs train ... train=True")
            train = True

        train_data = []
        train_labels = []
        for tweet in tweets:
            train_data.append(self.tokenizer.cleanText(tweet.content))
            train_labels.append(self.checkPolarity(tweet.polarity))

        # Create feature vectors (the vectorizer is fitted even when
        # ``train`` is false so later transform() calls work).
        train_vectors = self.vectorizer.fit_transform(train_data)

        if train:
            self._fit_with_grid_search(train_vectors, train_labels)
            if store:
                self._store_model(storepath)
        return train_labels

    def svc_param_selection(self, X, y, nfolds):
        """Grid-search C and gamma for an RBF ``SVC`` on (X, y).

        nfolds -- number of cross-validation folds.

        Returns the ``best_params_`` dict of the fitted search. Does not
        touch ``self.classifier``.
        """
        # NOTE(review): this method uses the bare name ``GridSearchCV``
        # while the training methods use ``grid_search.GridSearchCV``; kept
        # as-is because the file's imports are not visible from here.
        param_grid = {'C': list(self._GRID_CS),
                      'gamma': list(self._GRID_GAMMAS)}
        search = GridSearchCV(svm.SVC(kernel='rbf'),
                              param_grid,
                              cv=nfolds,
                              n_jobs=4,
                              verbose=1)
        search.fit(X, y)
        return search.best_params_

    def train_features(self,
                       train_data_path,
                       metadata_path,
                       features=None,
                       train=True,
                       store=True,
                       storepath=""):
        """Train the SVM on the feature matrix built from ``features``.

        features  -- names of the tweet JSON fields to use ("text" is
                     TF-IDF vectorized); at least one is required.
        train     -- when true, run the grid search and fit the classifier.
        store     -- when true (and training ran), dump the classifier to
                     ``storepath``.

        Returns the list of polarity labels of the tweets that were used.
        Raises ValueError if no features are given or no tweet has metadata.
        """
        (X, train_labels) = self.buildFeaturesFromCorpus(
            train_data_path, metadata_path, features)
        if train:
            self._fit_with_grid_search(X, train_labels)
            if store:
                self._store_model(storepath)
        return train_labels

    def buildFeaturesFromCorpus(self,
                                train_data_path,
                                metadata_path,
                                features=None,
                                test=False):
        """Parse a corpus and build the (X, labels) pair for ``features``.

        Tweets without ``jsonData`` are skipped. For every remaining tweet
        and every requested feature the JSON value is collected; a missing
        value is replaced by ``False``/``0``/``"null"`` depending on the
        feature's declared ``type``.

        test -- when true the text vectorizer is only applied
                (``transform``), otherwise it is re-fitted.

        Returns ``(X, labels)`` where ``X`` is the horizontal stack of one
        sparse block per feature and ``labels`` holds one polarity per used
        tweet.
        Raises ValueError if ``features`` is empty or no tweet has metadata.
        """
        if not features:
            # Previously this path died with an unbound local variable.
            raise ValueError(
                "buildFeaturesFromCorpus requires at least one feature name")

        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)

        train_labels = []
        featuresDict = {}
        for tweet in tweets:
            if not tweet.jsonData:
                continue
            for feature in features:
                if feature not in featuresDict:
                    # create the feature collector on first use
                    featuresDict[feature] = tweet_feature(feature)
                collected = featuresDict[feature]
                if feature in tweet.jsonData:
                    collected.data.append(tweet.jsonData[feature])
                    collected.count += 1
                elif collected.type == bool:
                    collected.data.append(False)
                elif collected.type == int:
                    collected.data.append(0)
                else:
                    collected.data.append("null")
                collected.totalCount += 1
            train_labels.append(self.checkPolarity(tweet.polarity))

        if not featuresDict:
            # Previously a bare KeyError on the first feature name.
            raise ValueError("no tweet with JSON metadata found in corpus")

        train_data = []
        for feature in features:
            featuresDict[feature].print_stats()
            train_data.append(
                self.buildFeatureDataMatrix(featuresDict[feature], test))

        # X : sparse matrix, [n_samples, n_features]
        X = scipy.sparse.hstack(train_data)
        return (X, train_labels)

    def test(self, test_data_path="", metadata_path="", load=False, model=""):
        """Predict polarities for the text of a corpus.

        load, model -- when both are truthy the classifier is first
                       replaced by the one loaded from ``model``.

        Returns the classifier's predictions, one per parsed tweet.
        """
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = [self.tokenizer.cleanText(tweet.content)
                     for tweet in tweets]
        test_vectors = self.vectorizer.transform(test_data)
        if load and model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        return self.classifier.predict(test_vectors)

    def test_features(self,
                      test_data_path="",
                      metadata_path="",
                      features=None,
                      load=False,
                      model=""):
        """Predict polarities from the feature matrix of a corpus.

        Returns ``(predictions, labels)`` where ``labels`` are the
        polarities read from the corpus itself.
        Raises ValueError if no features are given or no tweet has metadata.
        """
        (X, train_labels) = self.buildFeaturesFromCorpus(
            test_data_path, metadata_path, features, True)
        if load and model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(X)
        return (predictions, train_labels)

    def predict(self, test_data_path="", metadata_path="", model=""):
        """Print each cleaned tweet text followed by its predicted polarity.

        model -- path of a stored classifier to load first; when empty the
                 classifier already held by the instance is used.

        Returns the predictions (in addition to printing them).
        """
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = [self.tokenizer.cleanText(tweet.content)
                     for tweet in tweets]
        test_vectors = self.vectorizer.transform(test_data)
        if model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        # zip pairs every text with its own prediction; iterating the bare
        # tuple ``test_data, predictions`` walked the two containers instead.
        for text, prediction in zip(test_data, predictions):
            print(text)
            print(prediction)
        return predictions

    def checkPolarity(self, polarity_elements):
        """Return the first element that is not 'NONE', else 'NONE'.

        An empty or ``None`` argument also yields 'NONE'.
        """
        if not polarity_elements:
            return 'NONE'
        for polarity_element in polarity_elements:
            if polarity_element != 'NONE':
                return polarity_element
        return 'NONE'

    def buildFeatureDataMatrix(self, feature, test=False):
        """Turn one collected feature into a sparse column block.

        feature -- object exposing ``name`` and ``data``.
        test    -- for the "text" feature: apply the fitted vectorizer
                   instead of re-fitting it.

        The "text" feature is cleaned and TF-IDF vectorized; any other
        feature becomes a single sparse column (n_samples x 1).
        """
        if feature.name == "text":
            cleaned = [self.tokenizer.cleanText(data)
                       for data in feature.data]
            if test:
                return self.vectorizer.transform(cleaned)
            return self.vectorizer.fit_transform(cleaned)
        # ``copy`` must be passed by keyword: on current SciPy the first
        # positional parameter of transpose() is ``axes``, not ``copy``.
        row = scipy.sparse.csc_matrix(list(feature.data))
        return row.transpose(copy=True)
class MetaDataSentimentAnalisis:
    """SVM polarity classifier for tweets, using text and/or tweet metadata.

    The instance owns three collaborators created in ``__init__``:

    * ``self.tokenizer``  -- a ``Tokenizer`` whose ``cleanText`` is applied
      to every tweet text before vectorization.
    * ``self.vectorizer`` -- a ``TfidfVectorizer`` fitted by the training
      methods and reused (``transform`` only) by the test/predict methods.
    * ``self.classifier`` -- an SVM estimator chosen from ``kernel_type``;
      training replaces it with a fitted ``GridSearchCV`` wrapper.

    NOTE(review): only the classifier is persisted by ``joblib.dump``; the
    fitted vectorizer is not, so a stored model appears to be usable only
    together with an instance whose vectorizer was fitted on the same
    training data -- confirm against callers.
    """

    # Candidate hyper-parameters explored by the cross-validated grid search.
    _GRID_CS = [0.001, 0.01, 0.1, 1, 10]
    _GRID_GAMMAS = [0.001, 0.01, 0.1, 1]

    def __init__(self, kernel_type):
        """Create the tokenizer, the TF-IDF vectorizer and the classifier.

        kernel_type -- 'rbf', 'linear' or 'liblinear'; any other value
                       selects a default ``svm.SVC()``.
        """
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)

    def __init_classifier(self, kernel_type):
        """Instantiate ``self.classifier`` and ``self.vectorizer``."""
        if kernel_type == 'rbf':
            self.classifier = svm.SVC(C=1, gamma=0.0000001)
        elif kernel_type == 'linear':
            self.classifier = svm.SVC(kernel='linear')
        elif kernel_type == 'liblinear':
            self.classifier = svm.LinearSVC()
        else:
            self.classifier = svm.SVC()
        self.vectorizer = TfidfVectorizer(min_df=5,
                                          max_df=0.8,
                                          sublinear_tf=True,
                                          use_idf=True)

    def _fit_with_grid_search(self, X, labels):
        """Fit ``self.classifier`` on (X, labels) via a 3-fold grid search.

        ``self.classifier`` is replaced by the fitted ``GridSearchCV``.
        """
        # A previous training run leaves a GridSearchCV in self.classifier.
        # Unwrap it so repeated training does not nest one search inside
        # another (the outer search would then try to set C/gamma on the
        # inner search object and fail).
        estimator = getattr(self.classifier, 'estimator', self.classifier)

        param_grid = {'C': list(self._GRID_CS)}
        # LinearSVC has no ``gamma`` parameter; only search over it when the
        # estimator actually exposes it.
        if 'gamma' in estimator.get_params():
            param_grid['gamma'] = list(self._GRID_GAMMAS)

        self.classifier = grid_search.GridSearchCV(estimator,
                                                   param_grid,
                                                   cv=3,
                                                   n_jobs=4,
                                                   verbose=1)
        self.classifier.fit(X, labels)

    def _store_model(self, storepath):
        """Dump ``self.classifier`` to ``storepath`` (skipped if empty)."""
        if not storepath:
            # An empty path cannot be opened; report it instead of crashing
            # after an expensive grid search.
            print("store requested but storepath is empty ... model not stored")
            return
        joblib.dump(self.classifier, storepath)

    def train_text(self,
                   train_data_path,
                   metadata_path,
                   train=True,
                   parameters=None,
                   store=False,
                   storepath=""):
        """Fit the TF-IDF vectorizer on the corpus text and train the SVM.

        train_data_path, metadata_path -- forwarded to ``TweetParser.parse``.
        train      -- when true, run the grid search and fit the classifier.
        parameters -- accepted for backward compatibility; not used.
        store      -- when true, dump the fitted classifier to ``storepath``
                      (forces ``train`` to true).
        storepath  -- destination passed to ``joblib.dump``.

        Returns the list of polarity labels, one per parsed tweet.
        """
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)

        if store and not train:
            print("store needs train ... train=True")
            train = True

        train_data = []
        train_labels = []
        for tweet in tweets:
            train_data.append(self.tokenizer.cleanText(tweet.content))
            train_labels.append(self.checkPolarity(tweet.polarity))

        # Create feature vectors (the vectorizer is fitted even when
        # ``train`` is false so later transform() calls work).
        train_vectors = self.vectorizer.fit_transform(train_data)

        if train:
            self._fit_with_grid_search(train_vectors, train_labels)
            if store:
                self._store_model(storepath)
        return train_labels

    def svc_param_selection(self, X, y, nfolds):
        """Grid-search C and gamma for an RBF ``SVC`` on (X, y).

        nfolds -- number of cross-validation folds.

        Returns the ``best_params_`` dict of the fitted search. Does not
        touch ``self.classifier``.
        """
        # NOTE(review): this method uses the bare name ``GridSearchCV``
        # while the training methods use ``grid_search.GridSearchCV``; kept
        # as-is because the file's imports are not visible from here.
        param_grid = {'C': list(self._GRID_CS),
                      'gamma': list(self._GRID_GAMMAS)}
        search = GridSearchCV(svm.SVC(kernel='rbf'),
                              param_grid,
                              cv=nfolds,
                              n_jobs=4,
                              verbose=1)
        search.fit(X, y)
        return search.best_params_

    def train_features(self,
                       train_data_path,
                       metadata_path,
                       features=None,
                       train=True,
                       store=True,
                       storepath=""):
        """Train the SVM on the feature matrix built from ``features``.

        features  -- names of the tweet JSON fields to use ("text" is
                     TF-IDF vectorized); at least one is required.
        train     -- when true, run the grid search and fit the classifier.
        store     -- when true (and training ran), dump the classifier to
                     ``storepath``.

        Returns the list of polarity labels of the tweets that were used.
        Raises ValueError if no features are given or no tweet has metadata.
        """
        (X, train_labels) = self.buildFeaturesFromCorpus(
            train_data_path, metadata_path, features)
        if train:
            self._fit_with_grid_search(X, train_labels)
            if store:
                self._store_model(storepath)
        return train_labels

    def buildFeaturesFromCorpus(self,
                                train_data_path,
                                metadata_path,
                                features=None,
                                test=False):
        """Parse a corpus and build the (X, labels) pair for ``features``.

        Tweets without ``jsonData`` are skipped. For every remaining tweet
        and every requested feature the JSON value is collected; a missing
        value is replaced by ``False``/``0``/``"null"`` depending on the
        feature's declared ``type``.

        test -- when true the text vectorizer is only applied
                (``transform``), otherwise it is re-fitted.

        Returns ``(X, labels)`` where ``X`` is the horizontal stack of one
        sparse block per feature and ``labels`` holds one polarity per used
        tweet.
        Raises ValueError if ``features`` is empty or no tweet has metadata.
        """
        if not features:
            # Previously this path died with an unbound local variable.
            raise ValueError(
                "buildFeaturesFromCorpus requires at least one feature name")

        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)

        train_labels = []
        featuresDict = {}
        for tweet in tweets:
            if not tweet.jsonData:
                continue
            for feature in features:
                if feature not in featuresDict:
                    # create the feature collector on first use
                    featuresDict[feature] = tweet_feature(feature)
                collected = featuresDict[feature]
                if feature in tweet.jsonData:
                    collected.data.append(tweet.jsonData[feature])
                    collected.count += 1
                elif collected.type == bool:
                    collected.data.append(False)
                elif collected.type == int:
                    collected.data.append(0)
                else:
                    collected.data.append("null")
                collected.totalCount += 1
            train_labels.append(self.checkPolarity(tweet.polarity))

        if not featuresDict:
            # Previously a bare KeyError on the first feature name.
            raise ValueError("no tweet with JSON metadata found in corpus")

        train_data = []
        for feature in features:
            featuresDict[feature].print_stats()
            train_data.append(
                self.buildFeatureDataMatrix(featuresDict[feature], test))

        # X : sparse matrix, [n_samples, n_features]
        X = scipy.sparse.hstack(train_data)
        return (X, train_labels)

    def test(self, test_data_path="", metadata_path="", load=False, model=""):
        """Predict polarities for the text of a corpus.

        load, model -- when both are truthy the classifier is first
                       replaced by the one loaded from ``model``.

        Returns the classifier's predictions, one per parsed tweet.
        """
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = [self.tokenizer.cleanText(tweet.content)
                     for tweet in tweets]
        test_vectors = self.vectorizer.transform(test_data)
        if load and model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        return self.classifier.predict(test_vectors)

    def test_features(self,
                      test_data_path="",
                      metadata_path="",
                      features=None,
                      load=False,
                      model=""):
        """Predict polarities from the feature matrix of a corpus.

        Returns ``(predictions, labels)`` where ``labels`` are the
        polarities read from the corpus itself.
        Raises ValueError if no features are given or no tweet has metadata.
        """
        (X, train_labels) = self.buildFeaturesFromCorpus(
            test_data_path, metadata_path, features, True)
        if load and model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(X)
        return (predictions, train_labels)

    def predict(self, test_data_path="", metadata_path="", model=""):
        """Print each cleaned tweet text followed by its predicted polarity.

        model -- path of a stored classifier to load first; when empty the
                 classifier already held by the instance is used.

        Returns the predictions (in addition to printing them).
        """
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = [self.tokenizer.cleanText(tweet.content)
                     for tweet in tweets]
        test_vectors = self.vectorizer.transform(test_data)
        if model:
            # SECURITY: joblib.load unpickles; only load trusted model files.
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        # zip pairs every text with its own prediction; iterating the bare
        # tuple ``test_data, predictions`` walked the two containers instead.
        for text, prediction in zip(test_data, predictions):
            print(text)
            print(prediction)
        return predictions

    def checkPolarity(self, polarity_elements):
        """Return the first element that is not 'NONE', else 'NONE'.

        An empty or ``None`` argument also yields 'NONE'.
        """
        if not polarity_elements:
            return 'NONE'
        for polarity_element in polarity_elements:
            if polarity_element != 'NONE':
                return polarity_element
        return 'NONE'

    def buildFeatureDataMatrix(self, feature, test=False):
        """Turn one collected feature into a sparse column block.

        feature -- object exposing ``name`` and ``data``.
        test    -- for the "text" feature: apply the fitted vectorizer
                   instead of re-fitting it.

        The "text" feature is cleaned and TF-IDF vectorized; any other
        feature becomes a single sparse column (n_samples x 1).
        """
        if feature.name == "text":
            cleaned = [self.tokenizer.cleanText(data)
                       for data in feature.data]
            if test:
                return self.vectorizer.transform(cleaned)
            return self.vectorizer.fit_transform(cleaned)
        # ``copy`` must be passed by keyword: on current SciPy the first
        # positional parameter of transpose() is ``axes``, not ``copy``.
        row = scipy.sparse.csc_matrix(list(feature.data))
        return row.transpose(copy=True)