def extract_features(self,
                         corpus_file_pos,
                         corpus_file_neg,
                         verbose=False):
        '''
        Extract features from positive and negative corpora

        Keyword arguments:
            corpus_file_pos (str): path to positive corpus
            corpus_file_neg (str): path to negative corpus
            verbose (bool): stdout verbosity

        Returns:
            list or tuple: the list of extracted features, or a tuple of (features, tokenized raw tweets) if the texts flag is set
        '''
        res = []
        tweet_texts = []

        # extract features
        if verbose: print('extracting features...')

        for is_sarcastic in [True, False]:
            if verbose:
                print('   preprocessing samples with sarcastic=' +
                      str(is_sarcastic) + '...')
            # preprocess tweets
            if is_sarcastic:
                pipeline = Pipeline(corpus_file_pos,
                                    '../rsrc/de-tiger.map',
                                    verbose=verbose)
            else:
                pipeline = Pipeline(corpus_file_neg,
                                    '../rsrc/de-tiger.map',
                                    verbose=verbose)
            tweets_tkn, tweets_proc = pipeline.process()
            if verbose: print('   extracting features...')
            # extract features from tweets
            for tweet_tkn, tweet_proc in zip(tweets_tkn, tweets_proc):
                ext_features = self.extract_features_from_tweet(
                    tweet_tkn, tweet_proc, is_sarcastic)
                res.append(ext_features)
            tweet_texts.extend(tweets_tkn)

        if self.texts:
            return res, tweet_texts
        else:
            return res
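
For context, a minimal usage sketch of this method, mirroring the FeatureExtractor call in Example #4 below; the corpus paths are placeholders, and how the texts flag gets set (constructor argument or plain attribute) is an assumption rather than taken from the source:

# minimal usage sketch; corpus paths are placeholders
feature_extractor = FeatureExtractor(features, feature_order)

# with the texts flag unset, only the feature list comes back
samples = feature_extractor.extract_features('tweets_pos.txt', 'tweets_neg.txt', verbose=True)

# with the texts flag set (assumed here to be a plain attribute), the tokenized
# raw tweets are returned alongside the features
feature_extractor.texts = True
samples, tweet_texts = feature_extractor.extract_features('tweets_pos.txt', 'tweets_neg.txt')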
Example #2
    def preprocess(self):
        """
        Preprocessing based on Scheffler et al.'s German Twitter preprocessing
        """
        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert len(tweets_tkn) == len(tweets_proc) == len(labels)

        # write preprocessing results to file (tab-separated tweet/label pairs)
        with open('./daten/tokenized_tweets.txt', 'w') as tokenizedTweets_writer, \
                open('./daten/preprocessed_tweets.txt', 'w') as preprocTweets_writer:
            for x in range(len(tweets_proc)):
                t_tweet = " ".join(tweets_tkn[x])
                p_tweet = " ".join(
                    str(token) + "/" + str(tag) for token, tag in tweets_proc[x])
                label = labels[x]
                tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
                preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
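
Since both output files are plain tab-separated text (space-joined tweet, then label), here is a minimal sketch for reading them back in, assuming the ./daten/ paths used above:

# read the tokenized tweets and their labels back from the tab-separated file
with open('./daten/tokenized_tweets.txt') as reader:
    pairs = [line.rstrip('\n').split('\t') for line in reader if line.strip()]
tweets = [tweet.split(' ') for tweet, label in pairs]
labels = [label for tweet, label in pairs]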
Example #3
    def preprocess(self):
        """
        Preprocessing with stopword filtering and IWNLP lemmatization of the tokens
        """
        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert len(tweets_tkn) == len(tweets_proc) == len(labels)

        # filter stopwords + normalize tokens
        lemmatizer = IWNLPWrapper(
            lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
        german_stopwords = set(stopwords.words('german'))  # build the stopword set once, not per token
        lemmatized_tokens = []
        for tweet_tkn in tweets_tkn:
            tweet = []
            for token in tweet_tkn:
                if token.lower() in german_stopwords:
                    continue
                try:
                    lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                    # keep the first lemma if one was found, otherwise the raw token
                    tweet.append(lemma[0] if lemma else token)
                except Exception as e:
                    print(e)
                    tweet.append(token)  # fall back to the raw token on lemmatizer errors
            lemmatized_tokens.append(tweet)

        assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

        # write preprocessing results to file (tab-separated tweet/label pairs)
        with open('./daten/tokenized_tweets_normalized.txt', 'w') as tokenizedTweets_writer, \
                open('./daten/preprocessed_tweets_normalized.txt', 'w') as preprocTweets_writer:
            for x in range(len(lemmatized_tokens)):
                t_tweet = " ".join(lemmatized_tokens[x])
                p_tweet = " ".join(
                    str(token) + "/" + str(tag) for token, tag in tweets_proc[x])
                label = labels[x]
                tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
                preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
Example #4
    print('setting up data...')
    data = []
    if args.model == 'rnn':
        for is_sarcastic in [True, False]:
            print('  preprocessing samples with sarcastic=' +
                  str(is_sarcastic) + '...')
            # preprocess tweets
            if is_sarcastic:
                pipeline = Pipeline(args.corpus_file_pos,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            else:
                pipeline = Pipeline(args.corpus_file_neg,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            tweets_tkn, tweets_proc = pipeline.process()
            for tweet_proc in tweets_proc:
                data.append({'tweet': tweet_proc, 'class': is_sarcastic})

    if args.model in ['svm', 'mlp']:
        feature_extractor = FeatureExtractor(features, feature_order)
        data = feature_extractor.extract_features(
            args.corpus_file_pos, args.corpus_file_neg,
            verbose=True)  # extract features from training corpora

    # classifier setup
    classifiers = []
    if args.model == 'svm':
        classifiers.append({
            'name':
            'svm_classifier',
Example #5
# -*- coding: utf-8 -*-
import sys
import os.path

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from autosarkasmus.preprocessor.pipeline import Pipeline

# run the preprocessing pipeline on a small test corpus
pp = Pipeline("test.txt", "../rsrc/de-tiger.map")
tweets, tagged = pp.process()

# print each tokenized tweet followed by its token|tag sequence
for i in range(len(tweets)):
    print(" ".join(tweets[i]))
    output = ""
    for token, tag in tagged[i]:
        output += "{}|{} ".format(token, tag)
    print(output.strip())
    print()