def evaluate(clf, test_file, undersample=False, calc_semeval_f1=True,
             export_cm_file=None, verbose=False):
    '''Evaluate classifier on a given test set.

    Parameters
    ----------
    clf : fitted estimator with a ``predict`` method (e.g. the Pipeline
        returned by ``train``).
    test_file : path to a tab-separated file with a header row and two
        columns (text, label).
    undersample : when truthy, undersample the test set by label before
        evaluating.
    calc_semeval_f1 : also compute the SemEval sentiment F1 (best-effort;
        falls back to 0.0 on failure).
    export_cm_file : optional path; when given, the confusion matrix plot
        is saved there via ``plot_cm``.
    verbose : print progress information.
    '''
    if verbose:
        print('evaluating...')
    test = pd.read_csv(test_file, delimiter='\t', encoding='utf-8',
                       header=0, names=['text', 'label'])
    if undersample:
        test = undersampler.undersample(test, 'label')
    # Labels as fixed-width byte strings, matching how train() encodes them.
    Y = np.asarray(test['label'], dtype="|S8")

    # Labels and their counts; materialize the keys as a list so they stay
    # usable (and indexable by plot_cm) after the Counter is dropped.
    count = Counter(Y)
    labels = list(count.keys())
    if verbose:
        print('num of labels:')
        print(count)
    del count

    # predictions
    pred = clf.predict(test['text'])

    # calculate accuracy
    acc = accuracy_score(Y, pred)
    # calculate f1 score
    f1 = f1_score(Y, pred, average='micro')

    # calculate semeval f1 -- best-effort: keep 0.0 when it cannot be
    # computed (e.g. a sentiment class is absent from the predictions).
    semeval_f1 = 0.0
    if calc_semeval_f1:
        try:
            semeval_f1 = semeval_senti_f1(pred, Y)
        except Exception:
            semeval_f1 = 0.0

    # display
    print('SGD:')
    print('\tacc=%f\n\tsemeval_f1=%f\n\tmicro_f1=%f\n' % (acc, semeval_f1, f1))

    # confusion matrix
    cm = confusion_matrix(Y, pred)
    print(cm)
    if export_cm_file:
        if verbose:
            print('Saving confusion matrix to %s' % export_cm_file)
        plot_cm(cm, labels, export_cm_file)
def train(train_file, undersample=False, ngram=(1, 4), min_df=1, max_df=1.0,
          dim_reduction=None, n_dims=0, n_iter=200, class_weight='auto',
          n_jobs=1, verbose=False):
    '''Train a classifier.

    Builds a vectorizer + SGDClassifier pipeline and fits it on the
    labelled data in ``train_file`` (tab-separated, header row, columns
    text and label).

    ``dim_reduction`` selects the feature pipeline: ``None`` for plain
    counts, ``'svd'`` for TruncatedSVD (LSA) with L2 normalization, any
    other value for the hashing trick. ``n_dims`` sets the SVD component
    count or hashing feature-space size. Returns the fitted pipeline.
    '''
    if verbose:
        print('loading...')
    data = pd.read_csv(train_file, delimiter='\t', encoding='utf-8',
                       header=0, names=['text', 'label'])
    if undersample != 0:
        if verbose:
            print('undersampling (n={})...'.format(undersample))
        data = undersampler.undersample(data, 'label', undersample)
    X = data['text']
    # Labels as fixed-width byte strings.
    Y = np.asarray(data['label'], dtype="|S8")
    del data
    if verbose:
        label_counts = Counter()
        label_counts.update(Y)
        print('num of labels:')
        print(label_counts)
        del label_counts

    # Base hyper-parameters shared by every pipeline variant.
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__min_df': min_df,
        'vect__max_df': max_df,
        'vect__binary': True,
        'sgd__n_iter': n_iter,
        'sgd__shuffle': True,
        'sgd__class_weight': class_weight,
        'sgd__n_jobs': n_jobs
    }

    if dim_reduction is None:
        # Plain bag-of-ngrams straight into the linear model.
        steps = [('vect', CountVectorizer()), ('sgd', SGDClassifier())]
    elif dim_reduction == 'svd':
        # LSA: project counts onto n_dims components, then L2-normalize.
        steps = [('vect', CountVectorizer()), ('svd', TruncatedSVD()),
                 ('norm', Normalizer()), ('sgd', SGDClassifier())]
        params['svd__n_components'] = n_dims
        params['norm__copy'] = False
    else:
        # Hashing trick: fixed-size feature space, no df-based pruning.
        steps = [('vect', HashingVectorizer()), ('sgd', SGDClassifier())]
        params['vect__n_features'] = n_dims
        del params['vect__max_df']
        del params['vect__min_df']

    clf = Pipeline(steps)
    clf.set_params(**params)
    if verbose:
        print('fitting...')
    clf.fit(X, Y)
    return clf
def train(train_file, undersample=False, ngram=(1, 4), min_df=1, max_df=1.0,
          dim_reduction=None, n_dims=0, n_iter=200, class_weight='auto',
          n_jobs=1, verbose=False):
    '''Train a classifier.

    Parameters
    ----------
    train_file : path to a tab-separated file with a header row and two
        columns (text, label).
    undersample : when non-zero, passed to undersampler.undersample as the
        per-label target before fitting.
    ngram : (min_n, max_n) ngram range for the vectorizer.
    min_df, max_df : document-frequency pruning for CountVectorizer
        (removed from the params when the hashing vectorizer is used).
    dim_reduction : None (plain counts), 'svd' (TruncatedSVD/LSA), or any
        other value (HashingVectorizer).
    n_dims : number of SVD components, or hashing feature-space size.
    n_iter, class_weight, n_jobs : forwarded to SGDClassifier.
    verbose : print progress information.

    Returns
    -------
    The fitted sklearn Pipeline.
    '''
    # NOTE(review): this function duplicates the train() defined earlier in
    # this file; whichever definition runs last silently shadows the other.
    # Confirm which copy is intended and delete the duplicate.
    if verbose:
        print('loading...')
    # Local `train` shadows the function name; harmless but confusing.
    train = pd.read_csv(train_file, delimiter='\t', encoding='utf-8',
                        header=0, names=['text', 'label'])
    if undersample != 0:
        if verbose:
            print('undersampling (n={})...'.format(undersample))
        train = undersampler.undersample(train, 'label', undersample)
    X = train['text']
    # Labels as fixed-width byte strings.
    Y = np.asarray(train['label'], dtype="|S8")
    del train
    if verbose:
        count = Counter()
        count.update(Y)
        print('num of labels:')
        print(count)
        del count
    # create pipeline
    clf = None
    # basic parameters
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__min_df': min_df,
        'vect__max_df': max_df,
        'vect__binary': True,
        'sgd__n_iter': n_iter,
        'sgd__shuffle': True,
        'sgd__class_weight': class_weight,
        'sgd__n_jobs': n_jobs
    }
    # No dimensionality reduction
    if dim_reduction is None:
        clf = Pipeline([('vect', CountVectorizer()),
                        ('sgd', SGDClassifier())])
    # TruncatedSVD (LSA)
    elif dim_reduction == 'svd':
        clf = Pipeline([('vect', CountVectorizer()),
                        ('svd', TruncatedSVD()),
                        ('norm', Normalizer()),
                        ('sgd', SGDClassifier())])
        params['svd__n_components'] = n_dims
        params['norm__copy'] = False
    # Hashing Vectorizer
    else:
        clf = Pipeline([('vect', HashingVectorizer()),
                        ('sgd', SGDClassifier())])
        params['vect__n_features'] = n_dims
        # HashingVectorizer has no df-based pruning parameters.
        del params['vect__max_df']
        del params['vect__min_df']
    clf.set_params(**params)
    if verbose:
        print('fitting...')
    clf.fit(X, Y)
    return clf