def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # # Save to file
    results = X_test.to_frame(name='tokens')  # X_test is a Series; convert before adding a column
    results['pred'] = y_pred
    results.to_excel(scratch_path('predictions_%s.xlsx' % fname))
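A hedged usage sketch for this run variant: the dataset tuple mirrors the load_dataset(dataset[0], **dataset[1]) call above, the hyperparameter keys are copied from Example #3, and the dataset name and metric names are illustrative assumptions.

hyperparameters = {
    'model': 'multichannel',  # keys as in Example #3
    'epoch': 100,
    'learning_rate': 0.01,
    'max_sent_len': 50,
    'batch_size': 50,
}
run(('fdcl18', {'num_classes': 2}),  # hypothetical dataset spec
    hyperparameters,
    metrics=['accuracy', 'f1_macro'],  # assumed scorer names
    fname='textcnn_fdcl18')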
Example #2
def run(dataset, features, word_embedding, metrics, fname):
    if dataset.lower().startswith('f'):
        df = load_fdcl18()
    else:
        df = load_dwmw17()
    tqdm.pandas(desc='Preprocessing Progress: ')
    df['clean_tweet'] = df.tweet.progress_apply(TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df['tokens'] = df.clean_tweet.progress_apply(TweetTokenizer().tokenize)
    # # Feature Extraction
    # tfidf_pipeline
    ff = []
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(
            tokenizer=TweetTokenizer().tokenize,
            stop_words=stopwords,
            min_df=.0025,
            max_df=0.25,
            ngram_range=(1, 3)
        )
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]
    # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd', TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline', Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word_embedding), 'tokens')]
    # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer', HatebaseVectorizer(features=features['hatebase_vectorizer']), 'clean_tweet')]
    # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df.tokens
        hyper_params['word_vectors'] = word_embedding
        # """ # Cross-validate and save predictions
        args = [NeuralNetClassifier, hyper_params, ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # Estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)), ('clf', LinearSVC())])
    # # Evaluation (Cross Validation)
    # """ # Cross-validate and save predictions
    cv = CrossValidator(pipeline, n_splits=5, scoring=metrics)
    df['predictions'], cv_results = cv.cross_val_predict(df, df.label, return_scores=True)
    # """ Print Scores
    pprint({'dataset': dataset, 'features': features})
    pprint(cv_results)
    scores = {}
    for scorer in metrics:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100)]
    pprint(scores, type='table')
    # """ Save Predictions #
    df.to_excel(scratch_path('predictions_%s_%s.xlsx' % (dataset, fname)))
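A hedged invocation sketch for this feature-pipeline variant: the feature keys match the membership checks above, while the concrete values (the Hatebase feature list, the metric name) are illustrative assumptions.

features = {
    'tfidf_vectorizer': {},                 # presence of the key enables the branch
    'mean_embedding': {},
    'hatebase_vectorizer': ['sentiment'],   # hypothetical value, forwarded as HatebaseVectorizer(features=...)
}
run('fdcl18', features, word_embedding=load_word2vec(),
    metrics=['f1_macro'], fname='svm_features')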
Example #3
from nltk.tokenize import TweetTokenizer  # used below but missing from the original imports; NLTK is the assumed source
from scripts.utils import scratch_path
from tklearn.datasets import load_fdcl18, load_dwmw17
from tklearn.model_selection import CrossValidator
from tklearn.neural_network import NeuralNetClassifier
from tklearn.neural_network.model import TextCNN
from tklearn.preprocessing.tweet import TweetPreprocessor
from tklearn.text.word_vec import load_word2vec
from tklearn.utils import pprint

DATASET = 'FDCL18'

if __name__ == '__main__':
    # Load Dataset and Extract Features
    if DATASET.lower().startswith('f'):
        df = load_fdcl18(num_classes=2)
        pprint({'dataset': 'FDCL18(num_classes=2)'})
    else:
        df = load_dwmw17(num_classes=2)
        pprint({'dataset': 'DWMW17(num_classes=2)'})
    df['clean_tweets'] = df.tweet.apply(TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    # Load Resources
    word2vec = load_word2vec()
    # Hyperparameters
    kwargs = {
        'model': 'multichannel',
        'epoch': 100,
        'learning_rate': 0.01,
        'max_sent_len': 50,
        'batch_size': 50,
        # 'word_dim': 300,
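    }
    # Hypothetical continuation: the listing is truncated above, and the lines
    # below are a sketch modeled on Example #1, not the original script.
    clf = NeuralNetClassifier(module=TextCNN, corpus=df.tokens,
                              word_vectors=word2vec, **kwargs)
    clf.fit(df.tokens, df.label)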
Example #4
def run(dataset, features, word2vec, metrics, fname=None):
    if dataset == 'fdcl18':
        df1 = load_fdcl18(num_classes=2)
        df2 = load_dwmw17(num_classes=2)
        df2 = df2.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'], axis=1)
    else:
        df1 = load_dwmw17(num_classes=2)
        df2 = load_fdcl18(num_classes=2)
        df1 = df1.drop(
            ['count', 'hate_speech', 'offensive_language', 'neither'], axis=1)
    # # Preprocessing
    preprocess = TweetPreprocessor(normalize=['link', 'mention']).preprocess
    tokenize = TweetTokenizer().tokenize
    # # # DF 1 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df1['clean_tweet'] = df1.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df1['tokens'] = df1.clean_tweet.progress_apply(tokenize)
    # # # DF 2 - Preprocessing
    tqdm.pandas(desc='Preprocessing Progress: ')
    df2['clean_tweet'] = df2.tweet.progress_apply(preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df2['tokens'] = df2.clean_tweet.progress_apply(tokenize)
    # #
    # # Feature Extraction
    # # # tfidf_pipeline
    ff = []
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(tokenizer=TweetTokenizer().tokenize,
                            stop_words=stopwords,
                            min_df=.0025,
                            max_df=0.25,
                            ngram_range=(1, 3))
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs),
                'clean_tweet')]
    # # # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd',
                         TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline',
                Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # # # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word2vec), 'tokens')]
    # # # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer',
                HatebaseVectorizer(features=features['hatebase_vectorizer']),
                'clean_tweet')]
    # # # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df1.tokens
        hyper_params['word_vectors'] = word2vec
        # """ # Cross-validate and save predictions
        args = [
            NeuralNetClassifier, hyper_params,
            ['conv_%i' % i for i in range(3)], False
        ]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # # # estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)),
                         ('clf', LinearSVC())])
    # # Grid Search
    # param_grid = [
    #     {'clf__C': [0.1, 1, 10, 50], 'classifier': linear_svc},
    #     # {'classifier': sgd_classifier},
    # ]
    # gs = GridSearchCV(pipeline, param_grid, cv=5)
    # result = gs.fit(df, df.label).predict(df)
    # # Evaluation
    pipeline.fit(df1, df1.label)
    y_true, y_pred = df2.label, pipeline.predict(df2)
    # df2['predictions'] = y_pred
    # """ Print Scores
    pprint({'dataset': dataset, 'features': features})
    scores = {}
    for scorer in metrics:
        scores[scorer] = [get_score_func(scorer)(y_true, y_pred)]
    pprint(scores, type='table')
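For reference, a minimal sketch of what get_score_func could resolve to, assuming sklearn-style metric names; the real mapping lives inside tklearn, and both entries below are illustrative assumptions.

from sklearn.metrics import accuracy_score, f1_score

_SCORERS = {
    'accuracy': accuracy_score,
    'f1_macro': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
}

def get_score_func(name):
    # Return a callable scorer(y_true, y_pred) for the given name
    return _SCORERS[name]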