예제 #1
0
def evaluate(clf, test_file, undersample=False, calc_semeval_f1=True,
             export_cm_file=None, verbose=False):
    '''Evaluate classifier on a given test set.

    Args:
        clf: fitted classifier exposing ``predict``.
        test_file: path to a tab-separated file with a header row and
            two columns, (text, label).
        undersample: if truthy, undersample the test set per label.
        calc_semeval_f1: whether to also compute the SemEval F1 score.
        export_cm_file: optional path; when set, a confusion-matrix
            plot is saved there via ``plot_cm``.
        verbose: print progress information.
    '''
    if verbose:
        print('evaluating...')

    test = pd.read_csv(test_file, delimiter='\t', encoding='utf-8', header=0,
                       names=['text', 'label'])
    if undersample:
        test = undersampler.undersample(test, 'label')

    # labels as fixed-width byte strings, matching the training-time dtype
    Y = np.asarray(test['label'], dtype="|S8")

    # labels and their counts
    count = Counter(Y)
    # materialize the keys: the Counter itself is deleted below
    labels = list(count.keys())
    if verbose:
        print('num of labels:')
        print(count)
    del count

    # predictions
    pred = clf.predict(test['text'])

    # calculate accuracy
    acc = accuracy_score(Y, pred)

    # calculate f1 score
    f1 = f1_score(Y, pred, average='micro')

    # calculate semeval f1 (best effort: fall back to 0.0 on any failure,
    # but no longer with a bare except that would also hide KeyboardInterrupt)
    semeval_f1 = 0.0
    if calc_semeval_f1:
        try:
            semeval_f1 = semeval_senti_f1(pred, Y)
        except Exception:
            semeval_f1 = 0.0

    # display
    print('SGD:')
    print('\tacc=%f\n\tsemeval_f1=%f\n\tmicro_f1=%f\n' % (acc, semeval_f1, f1))

    # confusion matrix
    cm = confusion_matrix(Y, pred)
    print(cm)

    if export_cm_file:
        if verbose:
            print('Saving confusion matrix to %s' % export_cm_file)
        plot_cm(cm, labels, export_cm_file)
예제 #2
0
def train(train_file, undersample=False, ngram=(1, 4), min_df=1, max_df=1.0,
          dim_reduction=None, n_dims=0, n_iter=200, class_weight='auto',
          n_jobs=1, verbose=False):
    '''Train a classifier

    Args:
        train_file: path to a tab-separated file with a header row and
            two columns, (text, label).
        undersample: when non-zero, undersample each label to this count.
        ngram: ngram range handed to the CountVectorizer.
        min_df, max_df: document-frequency cutoffs for the vectorizer
            (dropped when the hashing vectorizer is used).
        dim_reduction: None (no reduction), 'svd' (TruncatedSVD / LSA),
            or anything else to use a HashingVectorizer.
        n_dims: output dimensionality for SVD / hashing.
        n_iter: number of SGD passes over the data.
        class_weight: class-weight mode for SGDClassifier.
        n_jobs: parallelism for SGDClassifier.
        verbose: print progress information.

    Returns:
        The fitted sklearn Pipeline.
    '''
    if verbose:
        print('loading...')

    train = pd.read_csv(train_file, delimiter='\t', encoding='utf-8', header=0,
                        names=['text', 'label'])
    if undersample != 0:
        if verbose:
            print('undersampling (n={})...'.format(undersample))
        train = undersampler.undersample(train, 'label', undersample)

    X = train['text']
    # labels as fixed-width byte strings so evaluation can use the same dtype
    Y = np.asarray(train['label'], dtype="|S8")
    del train  # free the frame before fitting

    if verbose:
        count = Counter()
        count.update(Y)
        print('num of labels:')
        print(count)
        del count

    # basic parameters shared by every pipeline variant
    params = {'vect__token_pattern': r"\S+",
              'vect__ngram_range': ngram,
              'vect__min_df': min_df,
              'vect__max_df': max_df,
              'vect__binary': True,
              'sgd__n_iter': n_iter,
              'sgd__shuffle': True,
              'sgd__class_weight': class_weight,
              'sgd__n_jobs': n_jobs
              }

    # No dimensionality reduction
    if dim_reduction is None:
        clf = Pipeline([('vect', CountVectorizer()), ('sgd', SGDClassifier())])
    # TruncatedSVD (LSA)
    elif dim_reduction == 'svd':
        clf = Pipeline([('vect', CountVectorizer()), ('svd', TruncatedSVD()),
                        ('norm', Normalizer()), ('sgd', SGDClassifier())])
        params['svd__n_components'] = n_dims
        params['norm__copy'] = False
    # Hashing Vectorizer
    else:
        clf = Pipeline([('vect', HashingVectorizer()),
                        ('sgd', SGDClassifier())])
        params['vect__n_features'] = n_dims
        # HashingVectorizer has no vocabulary, so df cutoffs do not apply
        del params['vect__max_df']
        del params['vect__min_df']

    clf.set_params(**params)

    if verbose:
        print('fitting...')

    clf.fit(X, Y)

    return clf
예제 #3
0
def train(train_file,
          undersample=False,
          ngram=(1, 4),
          min_df=1,
          max_df=1.0,
          dim_reduction=None,
          n_dims=0,
          n_iter=200,
          class_weight='auto',
          n_jobs=1,
          verbose=False):
    '''Train a classifier
    '''
    if verbose:
        print('loading...')

    data = pd.read_csv(train_file,
                       delimiter='\t',
                       encoding='utf-8',
                       header=0,
                       names=['text', 'label'])
    if undersample != 0:
        if verbose:
            print('undersampling (n={})...'.format(undersample))
        data = undersampler.undersample(data, 'label', undersample)

    X = data['text']
    Y = np.asarray(data['label'], dtype="|S8")
    del data

    if verbose:
        label_counts = Counter()
        label_counts.update(Y)
        print('num of labels:')
        print(label_counts)
        del label_counts

    # hyper-parameters shared by the vectorizer and the SGD stage
    params = {
        'vect__token_pattern': r"\S+",
        'vect__ngram_range': ngram,
        'vect__min_df': min_df,
        'vect__max_df': max_df,
        'vect__binary': True,
        'sgd__n_iter': n_iter,
        'sgd__shuffle': True,
        'sgd__class_weight': class_weight,
        'sgd__n_jobs': n_jobs,
    }

    if dim_reduction == 'svd':
        # TruncatedSVD (LSA) between counting and classification
        steps = [('vect', CountVectorizer()), ('svd', TruncatedSVD()),
                 ('norm', Normalizer()), ('sgd', SGDClassifier())]
        params['svd__n_components'] = n_dims
        params['norm__copy'] = False
    elif dim_reduction is None:
        # plain count vectorizer, no dimensionality reduction
        steps = [('vect', CountVectorizer()), ('sgd', SGDClassifier())]
    else:
        # hashing trick instead of an explicit vocabulary
        steps = [('vect', HashingVectorizer()), ('sgd', SGDClassifier())]
        params['vect__n_features'] = n_dims
        del params['vect__max_df']
        del params['vect__min_df']

    clf = Pipeline(steps)
    clf.set_params(**params)

    if verbose:
        print('fitting...')

    clf.fit(X, Y)

    return clf
예제 #4
0
def evaluate(clf,
             test_file,
             undersample=False,
             calc_semeval_f1=True,
             export_cm_file=None,
             verbose=False):
    '''Evaluate classifier on a given test set.

    Args:
        clf: fitted classifier exposing ``predict``.
        test_file: path to a tab-separated file with a header row and
            two columns, (text, label).
        undersample: if truthy, undersample the test set per label.
        calc_semeval_f1: whether to also compute the SemEval F1 score.
        export_cm_file: optional path; when set, a confusion-matrix
            plot is saved there via ``plot_cm``.
        verbose: print progress information.
    '''
    if verbose:
        print('evaluating...')

    test = pd.read_csv(test_file,
                       delimiter='\t',
                       encoding='utf-8',
                       header=0,
                       names=['text', 'label'])
    if undersample:
        test = undersampler.undersample(test, 'label')

    # labels as fixed-width byte strings, matching the training-time dtype
    Y = np.asarray(test['label'], dtype="|S8")

    # labels and their counts
    count = Counter(Y)
    # materialize the keys: the Counter itself is deleted below
    labels = list(count.keys())
    if verbose:
        print('num of labels:')
        print(count)
    del count

    # predictions
    pred = clf.predict(test['text'])

    # calculate accuracy
    acc = accuracy_score(Y, pred)

    # calculate f1 score
    f1 = f1_score(Y, pred, average='micro')

    # calculate semeval f1 (best effort: fall back to 0.0 on any failure,
    # but no longer with a bare except that would also hide KeyboardInterrupt)
    semeval_f1 = 0.0
    if calc_semeval_f1:
        try:
            semeval_f1 = semeval_senti_f1(pred, Y)
        except Exception:
            semeval_f1 = 0.0

    # display
    print('SGD:')
    print('\tacc=%f\n\tsemeval_f1=%f\n\tmicro_f1=%f\n' % (acc, semeval_f1, f1))

    # confusion matrix
    cm = confusion_matrix(Y, pred)
    print(cm)

    if export_cm_file:
        if verbose:
            print('Saving confusion matrix to %s' % export_cm_file)
        plot_cm(cm, labels, export_cm_file)