def _test_percents(self, human_classified_pickle, language):
    """Measure classification accuracy against human-labelled data.

    Loads the human classification pickle, re-classifies every labelled
    entry automatically and counts agreement with the human label.

    :param human_classified_pickle: path to pickle with human-made labels
    :param language: language passed through to ``self.classify``
    :return: 7-tuple ``(matches, true_positive, true_negative,
             false_positive, false_negative, unknown, entry_count)``
             (counts are floats so callers can divide without int division)
    """
    human_classified = HumanClassification(human_classified_pickle)
    human_classified.load()
    entry_count = len(human_classified.classification)
    true_positive = 0.0
    true_negative = 0.0
    matches = 0.0
    false_positive = 0.0
    false_negative = 0.0
    unknown = 0.0
    for entry_id in human_classified.classification:
        processed_entry = self.db.get_entry_by_id(entry_id)
        probability = self.classify(processed_entry.original_entry, language)
        if probability < self._low:
            # automatic verdict: negative
            if not human_classified.classification[entry_id]:
                matches += 1
                true_negative += 1
            else:
                false_negative += 1
        elif probability >= self._high:
            # automatic verdict: positive
            if human_classified.classification[entry_id]:
                matches += 1
                true_positive += 1
            else:
                false_positive += 1
        else:
            # probability fell between the thresholds - classifier undecided
            unknown += 1
    return (matches, true_positive, true_negative, false_positive,
            false_negative, unknown, entry_count)
def human_classify(self, output_pickle, language):
    """Interactively create ``output_pickle`` with user-defined
    classifications of entries. May be used for creating test data.

    Shows each not-yet-classified entry together with its automatic
    classification and asks the user to confirm/override it.  An answer
    of ``END`` (or Ctrl-C) stops early; progress is stored either way.

    :param output_pickle: path of the pickle file to create/extend
    :param language: language of entries to fetch and classify
    """
    self.db.connect(user='******', database='meco', host='localhost',
                    port=5432)
    new_human_classify = HumanClassification(output_pickle)
    new_human_classify.load()
    try:
        for entry in self.db.entries(language=language, entry_count=None,
                                     entry_offset=0):
            # skip entries that were already processed
            if entry.id in new_human_classify.classification:
                continue
            # classify once and reuse the result (was computed twice before)
            automatic_classification = self.classify(entry.original_entry,
                                                     language)
            print('Original entry: \n"' + entry.original_entry +
                  '"\n automatic classification = ' +
                  str(automatic_classification))
            if automatic_classification < self._low:
                auto = 'n'
                # NOTE(review): this skips prompting for sub-threshold
                # entries entirely; the original TODO ("odstranit" =
                # "remove") suggests it is temporary.
                continue  # TODO: remove
            elif automatic_classification >= self._high:
                auto = 'y'
            else:
                auto = '?'
            answer = raw_input('Is this entry relevant? (y/n/?/END))[' +
                               auto + ']: ')
            if answer == 'y':
                new_human_classify.classification[entry.id] = True
            elif answer == 'n':
                new_human_classify.classification[entry.id] = False
            elif answer == 'END':
                break
            else:
                # empty/other answer: accept the automatic classification
                if automatic_classification < self._low:
                    new_human_classify.classification[entry.id] = False
                elif automatic_classification >= self._high:
                    new_human_classify.classification[entry.id] = True
                else:
                    new_human_classify.classification[entry.id] = None
            print('Classified count = ' +
                  str(len(new_human_classify.classification)))
    except KeyboardInterrupt:
        # Ctrl-C just ends the session; collected answers are still saved
        pass
    new_human_classify.store()
def _test_corelation(self, human_classified_pickle, language):
    """Return the Pearson correlation between automatic classification
    and the human labels stored in ``human_classified_pickle``.

    X - automatically calculated probabilities
    Y - human input probabilities (mapped from the boolean label via
        ``HUMAN_RATING_PROBABILITY``)

    corelation = (E(XY) - E(X)E(Y)) /
                 (sqrt(E(X^2) - E(X)^2) * sqrt(E(Y^2) - E(Y)^2))
    """
    human_classified = HumanClassification(human_classified_pickle)
    human_classified.load()
    entry_count = len(human_classified.classification)
    # running sums; divided by entry_count below to get expectations
    sum_xy = 0.0
    sum_x = 0.0
    sum_y = 0.0
    sum_xx = 0.0
    sum_yy = 0.0
    for entry_id, label in human_classified.classification.items():
        processed_entry = self.db.get_entry_by_id(entry_id)
        x = self.classify(processed_entry.original_entry, language)
        if label:
            y = self.HUMAN_RATING_PROBABILITY
        else:
            y = 1 - self.HUMAN_RATING_PROBABILITY
        sum_xy += y * x
        sum_x += x
        sum_y += y
        sum_xx += x * x
        sum_yy += y * y
    e_xy = sum_xy / entry_count
    e_x = sum_x / entry_count
    e_y = sum_y / entry_count
    e_xx = sum_xx / entry_count
    e_yy = sum_yy / entry_count
    return (e_xy - e_x * e_y) / (sqrt(e_xx - e_x * e_x) *
                                 sqrt(e_yy - e_y * e_y))
def run_tests(self, input_file, language):
    """Run the test suite on ``input_file`` and print the statistics.

    Fills a ``Tests`` object with test/train set sizes, the correlation
    and the accuracy percentages, then prints it.

    :param input_file: pickle with human-classified test entries
    :param language: language of the entries being evaluated
    """
    self.db.connect(user='******', database='meco', host='localhost',
                    port=5432)
    human_data = HumanClassification(input_file)
    human_data.load()
    self._logger.info('Running tests...')
    tests = Tests()
    tests.set_test_len(len(human_data.classification))
    tests.set_train_len(len(self.human.classification))
    positives = self.human.get_positively_classified_count(language)
    negatives = self.human.get_negatively_classified_count(language)
    tests.set_train_positive_len(positives)
    tests.set_train_negative_len(negatives)
    self._logger.info('Calculating corelation...')
    tests.set_corelation(self._test_corelation(input_file, language))
    self._logger.info(
        'Calculating percentage of classification accuracy...')
    tests.set_percents(self._test_percents(input_file, language))
    print(tests)
def __init__(self, low=0.5, high=0.5,
             human_pickle='/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification',
             word_dict_pickle='/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary'):
    """Set up thresholds, logging, DB connection and pickled data.

    :param low: probabilities below this are classified as negative
    :param high: probabilities at/above this are classified as positive
    :param human_pickle: path to the HumanClassification pickle
        (defaults to the original hard-coded project path)
    :param word_dict_pickle: path to the WordDictionary pickle
        (defaults to the original hard-coded project path)
    """
    # classification thresholds
    self._low = float(low)
    self._high = float(high)
    # add and set up logger (root logger, DEBUG level)
    self._logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG)
    # db connection
    self.db = Connection()
    # load info about already classified entries
    self._logger.info('Loading Allready classified entries...')
    self.human = HumanClassification(human_pickle)
    self.human.load()
    # load database of words
    self._logger.info('Loading word dictionary...')
    self.word_dict = WordDictionary(word_dict_pickle)
    self.word_dict.load()
    # timer used for measuring classification time
    self._timer = timeit.Timer()