class BaseClassifier(tweepy.StreamListener):
    def __init__(self):
        # Create the objects to prevent repeated construction.
        self.remover = StopwordRemover()
        self.remover.build_lists()
        self.tokenizer = SimpleTokenizer()
        self.normalizer = VocabNormalizer()
        self.normalizer.build_map()
        super(BaseClassifier, self).__init__()

    def on_error(self, status_code):
        print "Error: " + repr(status_code)
        return False

    def on_status(self, status):
        # Filter out links and mentions first.
        text_filter = TweetTextFilter()
        text = text_filter.filter(status.text)

        # Tokenize the text.
        tokens = self.tokenizer.tokenize(text)
        tokens = self.remover.remove_all(tokens)

        # Normalize the vocabulary.
        tokens = self.normalizer.normalize(tokens)
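To make the preprocessing steps concrete, here is a minimal sketch that pushes one raw tweet through the same filter, tokenize, stopword-removal, and normalization stages. It assumes the helper classes (TweetTextFilter, SimpleTokenizer, StopwordRemover, VocabNormalizer) behave as they are used above; the sample tweet and the example outputs in the comments are invented.

    # A minimal sketch of the preprocessing pipeline; the helper classes
    # are assumed to work as they are used in the listener above.
    remover = StopwordRemover()
    remover.build_lists()
    tokenizer = SimpleTokenizer()
    normalizer = VocabNormalizer()
    normalizer.build_map()

    raw = u'@alice loving the new phone, details at http://example.com!'
    text = TweetTextFilter().filter(raw)   # strips the link and the mention
    tokens = tokenizer.tokenize(text)      # e.g. [u'loving', u'the', u'new', ...]
    tokens = remover.remove_all(tokens)    # drops stopwords such as u'the'
    tokens = normalizer.normalize(tokens)  # maps vocabulary variants
    print tokens                           # e.g. [u'love', u'new', u'phone']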
import json
import logging

import numpy
import tweepy

# TweetTextFilter, StopwordRemover, NLTKTokenizer, VocabNormalizer,
# ClassifierNotTrainedException and config are provided elsewhere in
# the project.


class BaseClassifier(tweepy.StreamListener):
    def __init__(self, feature_selector, tokenizer=NLTKTokenizer, **kwargs):
        # Set the feature selector.
        self.feature_selector_class = feature_selector

        # Create the objects to prevent repeated construction.
        self.text_filter = TweetTextFilter()
        self.remover = StopwordRemover()
        self.remover.build_lists()
        self.tokenizer = tokenizer()
        self.normalizer = VocabNormalizer()
        self.normalizer.build_map()
        self.max_features = config.max_features

        # Initialize some state. The logger was not set up in the
        # original listing; a module-level logger is assumed here.
        self.logger = logging.getLogger(__name__)
        self.training_data = dict()
        self.trained = False
        self.results = list()

        super(BaseClassifier, self).__init__()

    def train(self, training_sets):
        # Don't allow retraining.
        if self.trained:
            raise RuntimeError('Classifier is already trained')

        for set_name in training_sets:
            training_file = training_sets[set_name]
            set_data = list()

            self.logger.info('Reading training set "{0}" ({1})...'.format(
                set_name, training_file))

            # Read one JSON-encoded status per line from the set.
            with open(training_file, 'r') as f:
                for line in f:
                    status = json.loads(line)
                    term_vector = self.get_term_vector(status)
                    set_data.append(term_vector)

            self.training_data[set_name] = set_data

        self.logger.info('Reading training sets complete.')
        self.set_trained(True)

        # Create the feature selector.
        self.feature_selector = self.feature_selector_class(self.training_data)

    def get_data_count(self):
        # Total number of training examples across all categories.
        data_count = 0
        for category_name in self.training_data:
            data_count += len(self.training_data[category_name])
        return data_count

    def normalize_term_vector(self, term_vector, features):
        # Map a term vector onto the selected features as a binary
        # numpy array: 1 if the feature occurs, 0 otherwise.
        norm = list()
        for feature in features:
            if feature in term_vector:
                norm.append(1)
            else:
                norm.append(0)
        return numpy.array(norm)

    def set_max_features(self, max_features):
        self.max_features = max_features

    def get_max_features(self):
        return self.max_features

    def set_trained(self, trained):
        self.trained = trained

    def get_trained(self):
        return self.trained

    def get_term_vector(self, status):
        # Filter out links and mentions first. Statuses arrive either as
        # parsed JSON dicts or as tweepy Status objects.
        if hasattr(status, '__getitem__'):
            text = self.text_filter.filter(status['text'])
        else:
            text = self.text_filter.filter(status.text)

        # Tokenize the text.
        tokens = self.tokenizer.tokenize(text)
        tokens = self.remover.remove_all(tokens)

        # Normalize the vocabulary.
        tokens = self.normalizer.normalize(tokens)

        # Create the term vector (token -> occurrence count).
        term_vector = dict()
        for token in tokens:
            if token in term_vector:
                term_vector[token] += 1
            else:
                term_vector[token] = 1

        return term_vector

    def on_error(self, status_code):
        print "Error: " + repr(status_code)
        return False

    def on_status(self, status):
        if self.trained is False:
            raise ClassifierNotTrainedException('Classifier must be trained '
                                                'before use.')

    def publish_result(self, status, categories):
        self.print_categories(status, categories)
        self.results.append(categories)

    def get_results(self):
        return self.results

    def print_categories(self, status, categories):
        if not config.quiet_mode:
            if hasattr(status, '__getitem__'):
                status_text = status['text']
            else:
                status_text = status.text
            print u'{0}: ({1})'.format(categories, status_text)
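BaseClassifier leaves the actual categorization to subclasses: on_status only enforces that the classifier has been trained, and publish_result records whatever categories a subclass assigns. As a rough illustration, the sketch below wires up a toy subclass and a feature-selector stub. NaiveSelector, OverlapClassifier, and the training-file paths are all hypothetical, and the term-overlap heuristic merely stands in for a real classification step.

    class NaiveSelector(object):
        # Hypothetical stub matching the constructor call in train(); a
        # real selector would rank terms and expose the chosen features.
        def __init__(self, training_data):
            self.training_data = training_data


    class OverlapClassifier(BaseClassifier):
        # Hypothetical subclass: pick the category whose training tweets
        # share the most terms with the incoming status.
        def on_status(self, status):
            super(OverlapClassifier, self).on_status(status)  # trained check
            term_vector = self.get_term_vector(status)
            scores = dict()
            for name, vectors in self.training_data.items():
                scores[name] = sum(1 for vector in vectors
                                   if any(term in vector for term in term_vector))
            self.publish_result(status, [max(scores, key=scores.get)])


    classifier = OverlapClassifier(feature_selector=NaiveSelector)
    classifier.train({
        'positive': 'data/positive.json',  # hypothetical newline-delimited
        'negative': 'data/negative.json',  # JSON training files
    })

    # Statuses normally arrive via the tweepy stream, but on_status also
    # accepts a parsed JSON dict, as get_term_vector shows.
    classifier.on_status({'text': u'Best phone I have ever owned!'})
    print classifier.get_results()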