Example 1
    def extract_features(self,
                         corpus_file_pos,
                         corpus_file_neg,
                         verbose=False):
        '''
        Extract features from positive and negative corpora

        Keyword arguments:
            corpus_file_pos (str): path to positive corpus
            corpus_file_neg (str): path to negative corpus
            verbose (bool): stdout verbosity

        Returns:
            list or tuple: the list of extracted feature dicts; if the texts flag is set, a tuple of that list and the tokenized raw tweets
        '''
        res = []
        tweet_texts = []

        # extract features
        if verbose: print('extracting features...')

        for is_sarcastic in [True, False]:
            if verbose:
                print('   preprocessing samples with sarcastic=' +
                      str(is_sarcastic) + '...')
            # preprocess tweets
            if is_sarcastic:
                pipeline = Pipeline(corpus_file_pos,
                                    '../rsrc/de-tiger.map',
                                    verbose=verbose)
            else:
                pipeline = Pipeline(corpus_file_neg,
                                    '../rsrc/de-tiger.map',
                                    verbose=verbose)
            tweets_tkn, tweets_proc = pipeline.process()
            if verbose: print('   extracting features...')
            # extract features from tweets
            for tweet_index in range(len(tweets_tkn)):
                ext_features = self.extract_features_from_tweet(
                    tweets_tkn[tweet_index], tweets_proc[tweet_index],
                    is_sarcastic)
                res.append(ext_features)
            tweet_texts.extend(tweets_tkn)

        if self.texts:
            return res, tweet_texts
        else:
            return res
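
A minimal usage sketch for this method, mirroring the call pattern in Example 8 below; the import path and corpus file names are assumptions, and setup_features is the helper used in Examples 5, 7 and 8:

# Sketch only: import path and corpus paths are placeholders, not confirmed here.
from autosarkasmus.baseline.feature_extractor import FeatureExtractor  # hypothetical path

features, feature_order = setup_features()  # defined elsewhere in the repo
extractor = FeatureExtractor(features, feature_order)
tweets_ext = extractor.extract_features('tweets_pos.txt', 'tweets_neg.txt', verbose=True)
print('extracted features from ' + str(len(tweets_ext)) + ' tweets.')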
Example 2
    def preprocess(self):
        """
        Preprocessing based on Scheffler et al.'s German Twitter preprocessing
        """
        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert len(tweets_tkn) == len(tweets_proc) == len(labels)

        # write preprocessing results to file (context managers close the files)
        with open('./daten/tokenized_tweets.txt', 'w') as tokenizedTweets_writer, \
             open('./daten/preprocessed_tweets.txt', 'w') as preprocTweets_writer:
            for i in range(len(tweets_proc)):
                t_tweet = " ".join(tweets_tkn[i])
                p_tweet = " ".join(str(tok) + "/" + str(tag)
                                   for tok, tag in tweets_proc[i])
                label = labels[i]
                tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
                preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
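
Both writers emit tab-separated tweet/label lines; a minimal read-back sketch for the tokenized file (the file name is taken from the code above):

# Reads './daten/tokenized_tweets.txt' back into (token list, label) pairs.
samples = []
with open('./daten/tokenized_tweets.txt', 'r') as fop:
    for line in fop:
        tweet, label = line.rstrip('\n').split('\t')
        samples.append((tweet.split(' '), label))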
Example 3
    def preprocess(self):
        """
        Preprocessing with stopword filtering and IWNLP lemmatization
        """
        pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
        tweets_tkn, tweets_proc, labels = pp.process()
        assert len(tweets_tkn) == len(tweets_proc) == len(labels)

        # filter stopwords + normalize tokens
        lemmatizer = IWNLPWrapper(
            lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
        german_stopwords = set(stopwords.words('german'))  # hoisted out of the loop
        lemmatized_tokens = []
        for tokens in tweets_tkn:
            tweet = []
            for token in tokens:
                if token.lower() in german_stopwords:
                    continue
                try:
                    lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                    tweet.append(lemma[0] if lemma else token)
                except Exception as e:
                    print(e)
            lemmatized_tokens.append(tweet)

        assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

        # write preprocessing results to file (context managers close the files)
        with open('./daten/tokenized_tweets_normalized.txt', 'w') as tokenizedTweets_writer, \
             open('./daten/preprocessed_tweets_normalized.txt', 'w') as preprocTweets_writer:
            for i in range(len(lemmatized_tokens)):
                t_tweet = " ".join(lemmatized_tokens[i])
                p_tweet = " ".join(str(tok) + "/" + str(tag)
                                   for tok, tag in tweets_proc[i])
                label = labels[i]
                tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
                preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
Example 4
    def __init__(self, config_path, verbose=False):
        '''
        Constructor of AutosarkasmusBot

        Keyword arguments:
            config_path (str): path to the json configuration file
            verbose (bool): stdout verbosity
        '''
        self.verbose = verbose
        self._load_config(config_path)  # load config from file
        # Twitter API parameters
        self.oauth = tweepy.OAuthHandler(self.CONSUMER_KEY,
                                         self.CONSUMER_SECRET)
        self.oauth.set_access_token(self.ACCESS_KEY, self.ACCESS_SECRET)
        self.twitter_api = tweepy.API(self.oauth)
        # tweet processing
        self.pipeline = Pipeline(self.training_corpus_positive_path,
                                 self.pipeline_tagger_mapping_path)
        self.feature_extractor = FeatureExtractor(self.features,
                                                  self.feature_order)
        self.classifier = MultiLayerPerceptronClassifier(self.feature_order,
                                                         verbose=self.verbose)
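
A hedged end-to-end sketch of how this constructor is used, based on the train and is_sarcastic_tweet methods shown in Example 9; the import path, config file name, and sample tweet are placeholders:

from autosarkasmus.bot import AutosarkasmusBot  # hypothetical import path

bot = AutosarkasmusBot('bot_config.json', verbose=True)  # placeholder config path
bot.train()  # extract features from the configured corpora and train the MLP
print(bot.is_sarcastic_tweet('Na toll, schon wieder Montag.'))  # made-up example tweet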
Example 5
    embeddings = pickle.load(open(args.embeddings_file, 'rb'),
                             encoding='bytes')
    print('setting up features...')
    features, feature_order = setup_features()

    # data setup
    print('setting up data...')
    data = []
    if args.model == 'rnn':
        for is_sarcastic in [True, False]:
            print('  preprocessing samples with sarcastic=' +
                  str(is_sarcastic) + '...')
            # preprocess tweets
            if is_sarcastic:
                pipeline = Pipeline(args.corpus_file_pos,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            else:
                pipeline = Pipeline(args.corpus_file_neg,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            tweets_tkn, tweets_proc = pipeline.process()
            for tweet_proc in tweets_proc:
                data.append({'tweet': tweet_proc, 'class': is_sarcastic})

    if args.model in ['svm', 'mlp']:
        feature_extractor = FeatureExtractor(features, feature_order)
        data = feature_extractor.extract_features(
            args.corpus_file_pos, args.corpus_file_neg,
            verbose=True)  # extract features from training corpora
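
For clarity, the two model branches fill data with differently shaped records; a comment sketch with illustrative values (the token/tag pairs follow the example output shown in Example 6):

# 'rnn': one dict per tweet holding the processed token/tag pairs and the class, e.g.
#     {'tweet': [('und', 'CONJ'), ('wieder', 'ADV')], 'class': True}
# 'svm'/'mlp': the feature dicts returned by FeatureExtractor.extract_features
#     (see Example 1), each including a 'class' entry.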
Example 6
'''
Activates our pipeline to prepare an output file for tagger evaluation.

Input: gold_corpus.raw
Output: our_tagger_output.tagged
'''

import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from autosarkasmus.preprocessor.pipeline import Pipeline

pipe = Pipeline('foo', '../rsrc/de-tiger.map') # creates pipeline with mapping from STTS to universal

# read the gold corpus
corpus = []
with open('gold_corpus.raw', 'r') as fop:
    for line in fop:
        corpus.append([token.strip() for token in line.split(' ')])

tweets = [] # tweets read from the corpus are collected separately in this list
tweets_tagged = [] # this is the tagger output
# example: [('und', 'CONJ'), ('wieder', 'ADV'), ... , ('hashtag', 'NN'), ('user', 'NN')]

for tweet in corpus: # iterate through tweets in gold corpus
    tweet_tagged = pipe.tag(tweet)
    newLemma = []
    # example: [(tuple(('hashtag','HASH')) if lemma[0] == 'hashtag' else lemma) for lemma in tweet_tagged]
    for lemma in tweet_tagged:
        # the snippet is cut off here; this completion follows the commented example
        if lemma[0] == 'hashtag':
            newLemma.append(('hashtag', 'HASH'))
        else:
            newLemma.append(lemma)
    tweets_tagged.append(newLemma)
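
The module docstring names our_tagger_output.tagged as the output; a sketch of the final write-out under the assumption that the token/TAG line format of the preprocessing examples is reused:

# Sketch: serialize the tagged tweets; the exact output format is an assumption.
with open('our_tagger_output.tagged', 'w') as fout:
    for tweet_tagged in tweets_tagged:
        fout.write(' '.join(token + '/' + tag for token, tag in tweet_tagged) + '\n')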
Example 7
    # feature setup
    print('setting up features...')
    features, feature_order = setup_features()

    # resource setup
    print('setting up resources...')
    resources = {}

    # ARFF document setup
    arff_doc = ARFFDocument('Sarkasmuserkennung', features, feature_order)

    # extract features
    print('extracting features...')

    print('   preprocessing positive samples...')
    pipeline = Pipeline(args.corpus_file_pos, '../rsrc/de-tiger.map')
    tweets_tkn, tweets_proc = pipeline.process()
    print('   extracting features for positive samples...')
    for tweet_index in range(len(tweets_tkn)):
        ext_features = extract_features(tweets_tkn[tweet_index], tweets_proc[tweet_index], True, features, feature_order, resources)
        arff_doc.add_data(ext_features)
    print('   writing to files (for safety)...')
    pipeline.write_file(tweets_tkn, args.corpus_file_pos + '.tkn')
    pipeline.write_file(tweets_proc, args.corpus_file_pos + '.proc')
    arff_doc.generate_document(args.output_file + '.pos')

    print('   preprocessing negative samples...')
    pipeline = Pipeline(args.corpus_file_neg, '../rsrc/de-negra.map')
    tweets_tkn, tweets_proc = pipeline.process()
    print('   extracting features for negative samples...')
    for tweet_index in range(len(tweets_tkn)):
        # the snippet is cut off here; this completion mirrors the positive block
        ext_features = extract_features(tweets_tkn[tweet_index], tweets_proc[tweet_index], False, features, feature_order, resources)
        arff_doc.add_data(ext_features)
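
Presumably the negative block also mirrors the write-out of the positive one; a sketch under that assumption:

    print('   writing to files (for safety)...')
    pipeline.write_file(tweets_tkn, args.corpus_file_neg + '.tkn')
    pipeline.write_file(tweets_proc, args.corpus_file_neg + '.proc')
    arff_doc.generate_document(args.output_file + '.neg')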
Example 8
    # argument parsing
    arg_parser = argparse.ArgumentParser(
        description='Feature Extraction for the Autosarkasmus Baseline')
    arg_parser.add_argument('training_pos_path',
                            help='path to the positive training corpus')
    arg_parser.add_argument('training_neg_path',
                            help='path to the negative training corpus')
    arg_parser.add_argument('tagger_mapping_path',
                            help='path to the tagger mapping')
    arg_parser.add_argument('-f', '--input_file', help='path to input file')
    args = arg_parser.parse_args()

    print('\n - Autosarkasmus Demo -\n')

    # preprocessing pipeline setup
    pipeline = Pipeline(args.training_pos_path, args.tagger_mapping_path)

    # feature setup
    print('setting up features...')
    features, feature_order = setup_features()

    # feature extraction
    print('setting up feature extractor...')
    feature_extractor = FeatureExtractor(features, feature_order)
    tweets_ext = feature_extractor.extract_features(args.training_pos_path,
                                                    args.training_neg_path,
                                                    verbose=True)
    print('extracted features from ' + str(len(tweets_ext)) + ' tweets.')

    # svm training
    print('training classifier...')
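
The snippet ends before the training call; no SVM class appears in these examples, so by analogy with the classifier API of Examples 4 and 9 the step might look like this:

    # Sketch: class name borrowed from Examples 4/9; the SVM variant is not shown here.
    classifier = MultiLayerPerceptronClassifier(feature_order, verbose=True)
    classifier.train(tweets_ext)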
Example 9
class AutosarkasmusBot:
    '''
    A bot for the @autosarkasmus twitter account

    Processes sarcasm classification requests and corresponding feedback.
    '''
    def __init__(self, config_path, verbose=False):
        '''
        Constructor of AutosarkasmusBot

        Keyword arguments:
            config_path (str): path to the json configuration file
            verbose (bool): stdout verbosity
        '''
        self.verbose = verbose
        self._load_config(config_path)  # load config from file
        # Twitter API parameters
        self.oauth = tweepy.OAuthHandler(self.CONSUMER_KEY,
                                         self.CONSUMER_SECRET)
        self.oauth.set_access_token(self.ACCESS_KEY, self.ACCESS_SECRET)
        self.twitter_api = tweepy.API(self.oauth)
        # tweet processing
        self.pipeline = Pipeline(self.training_corpus_positive_path,
                                 self.pipeline_tagger_mapping_path)
        self.feature_extractor = FeatureExtractor(self.features,
                                                  self.feature_order)
        self.classifier = MultiLayerPerceptronClassifier(self.feature_order,
                                                         verbose=self.verbose)

    def _load_config(self, config_path):
        '''
        Loads configuration from JSON file

        Keyword arguments:
            config_path (str): path to the json configuration file
        '''
        config_json = {}
        try:
            with open(config_path, 'r', encoding='utf8') as fop:
                config_json = json.load(fop)
        except Exception as ex:
            print('Error: Could not read config file at "' + config_path +
                  '".')
            print(ex)
        self.screen_name = config_json.get('SCREEN_NAME',
                                           None)  # screen name used by the bot
        self.enquiry_pattern = config_json.get(
            'ENQUIRY_PATTERN', None)  # pattern that matches an enquiry
        self.enquiry_responses = config_json.get('ENQUIRY_RESPONSES', {
            'positive': [],
            'negative': []
        })  # responses to an enquiry
        self.feedback_responses = config_json.get('FEEDBACK_RESPONSES', {
            'positive': [],
            'negative': []
        })  # responses to feedback
        self.CONSUMER_KEY = config_json.get('CONSUMER_KEY',
                                            None)  # Twitter API consumer key
        self.CONSUMER_SECRET = config_json.get(
            'CONSUMER_SECRET', None)  # Twitter API consumer token
        self.ACCESS_KEY = config_json.get(
            'ACCESS_KEY', None)  # Twitter API application access key
        self.ACCESS_SECRET = config_json.get(
            'ACCESS_SECRET', None)  # Twitter API application secret key
        self.pipeline_tagger_mapping_path = config_json.get(
            'PIPELINE_TAGGER_MAPPING_PATH',
            None)  # path to tagger mapping file
        self.training_corpus_positive_path = config_json.get(
            'TRAINING_CORPUS_POSITIVE_PATH',
            None)  # path to corpus with positive training data
        self.training_corpus_negative_path = config_json.get(
            'TRAINING_CORPUS_NEGATIVE_PATH',
            None)  # path to corpus with negative training data
        self.history_path = config_json.get('HISTORY_PATH',
                                            None)  # path to the bot's history
        # load history to memory
        self.history = {}
        try:
            with open(self.history_path, 'r', encoding='utf8') as fop:
                self.history = json.load(fop)
        except Exception as ex:
            print('Error: Could not read history file at "' +
                  str(self.history_path) + '".')
            print(ex)
        # load features to memory
        self.features = {}
        self.feature_order = []
        features_json = config_json.get('FEATURES', [])
        for feature_json in features_json:
            self.feature_order.append(feature_json['key'])
            self.features[feature_json['key']] = feature_json['values']

    def train(self):
        '''
        Train the bot on given data
        '''
        tweets_ext = self.feature_extractor.extract_features(
            self.training_corpus_positive_path,
            self.training_corpus_negative_path,
            verbose=self.verbose)  # extract features from training corpora
        if self.verbose: print('training classifier...')
        self.classifier.train(tweets_ext)

    def classify_tweet(self, tweet_raw):
        '''
        Classifies a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            dict: the extracted features of the tweet in addition to class
        '''
        if self.verbose: print('classifying tweet: "' + tweet_raw + '"')
        tweet_tkn, tweet_proc = self.pipeline.process_tweet(
            tweet_raw)  # preprocess the raw tweet
        if self.verbose: print(str(tweet_tkn) + '\n' + str(tweet_proc))
        tweet_ext = self.feature_extractor.extract_features_from_tweet(
            tweet_tkn, tweet_proc,
            True)  # extract features from tweet (sarcasm is True per default)
        del tweet_ext['class']  # delete class since it is only a default value
        if self.verbose:
            print([(feature, tweet_ext[feature])
                   for feature in self.feature_order
                   if tweet_ext.get(feature, 0) != 0
                   ])  # print all features != 0
        tweet_class = self.classifier.classify([tweet_ext])  # classify the tweet
        if self.verbose:
            print('classified with sarcasm:', tweet_class[0]['class'])
        return tweet_class[0]

    def is_sarcastic_tweet(self, tweet_raw):
        '''
        Identifies sarcasm in a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            bool: whether the tweet was classified as being sarcastic
        '''
        return self.classify_tweet(tweet_raw)['class']

    def is_valid_enquiry(self, tweet_json):
        '''
        Checks whether the given tweet is a valid enquiry

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is a valid enquiry
        '''
        res = False
        # check if tweet matches enquiry pattern, no case matching
        if re.match(self.enquiry_pattern, tweet_json['text'], re.IGNORECASE):
            # check if tweet wasn't authored by the bot itself
            if tweet_json['user']['screen_name'] != self.screen_name:
                # check for retweeted enquiry
                if not tweet_json['retweeted']:
                    # check if tweet is a reply to anything
                    if tweet_json['in_reply_to_status_id']:
                        res = True
        return res

    def is_valid_feedback(self, tweet_json):
        '''
        Checks whether the given tweet is valid feedback

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is valid feedback
        '''
        res = False
        # check if tweet wasn't authored by the bot itself
        if tweet_json['user']['screen_name'] != self.screen_name:
            # check for retweet
            if not tweet_json['retweeted']:
                # check if tweet is reply to a tweet sent by the bot
                if tweet_json['in_reply_to_status_id']:
                    reply_tweet_status = self.twitter_api.get_status(
                        tweet_json['in_reply_to_status_id'])
                    if reply_tweet_status.user.screen_name == self.screen_name:
                        # check if bot tweeted a classification
                        for enquiry_response in (
                                self.enquiry_responses['positive'] +
                                self.enquiry_responses['negative']):
                            if enquiry_response in reply_tweet_status.text:
                                res = True
                                break
        return res

    def gen_enquiry_response(self, recipient, tweet_is_sarcastic):
        '''
        Generate response to an enquiry

        Keyword arguments:
            recipient (str): user handle of the addressee
            tweet_is_sarcastic (bool): whether the tweet was classified as sarcastic

        Returns:
            str: twitter-ready response
        '''
        response = '😓'  # default response
        # pick fitting response at random
        if tweet_is_sarcastic:
            response = self.enquiry_responses['positive'][randint(
                0,
                len(self.enquiry_responses['positive']) - 1)]
        else:
            response = self.enquiry_responses['negative'][randint(
                0,
                len(self.enquiry_responses['negative']) - 1)]
        # prepend the recipient
        response = '@' + recipient + ' ' + response + ' Korrekt? (j/n)'
        # trim tweet if necessary
        if len(response) > 140:
            response = response[:137] + '...'
        return response

    def gen_feedback_response(self, recipient, correctly_classified):
        '''
        Generate response to feedback

        Keyword arguments:
            recipient (str): user handle of the addressee
            correctly_classified (bool): whether the tweet was correctly classified

        Returns:
            str: twitter-ready response
        '''
        response = 'Danke! ^^'  # default response
        # pick fitting response at random
        if correctly_classified:
            response = self.feedback_responses['positive'][randint(
                0,
                len(self.feedback_responses['positive']) - 1)]
        else:
            response = self.feedback_responses['negative'][randint(
                0,
                len(self.feedback_responses['negative']) - 1)]
        # prepend the recipient
        response = '@' + recipient + ' ' + response
        # trim tweet if necessary
        if len(response) > 140:
            response = response[:137] + '...'
        return response

    def respond(self, tweet_json):
        '''
        Respond to a tweet

        Responses are generated for classification enquiries and feedback to said enquiries

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet to respond to
        '''
        if self.verbose:
            print('mentioned by @' + tweet_json['user']['screen_name'] +
                  '\n"' + tweet_json['text'] + '"')
        bot_response = None

        if self.is_valid_enquiry(
                tweet_json):  # if tweet is a classification enquiry
            # skip tweets that were already classified; ids are stored as
            # strings since JSON object keys are strings
            if str(tweet_json['in_reply_to_status_id']) not in self.history:
                eval_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id']
                )  # get tweet to be classified (as Tweepy.Status object)
                eval_tweet_sarcastic = self.is_sarcastic_tweet(
                    eval_tweet_status.text)  # classify the tweet
                bot_response = self.gen_enquiry_response(
                    tweet_json['user']['screen_name'],
                    eval_tweet_sarcastic)  # generate response accordingly
                # save tweet and its classification in history
                eval_id = str(eval_tweet_status.id)  # JSON object keys are strings
                self.history[eval_id] = eval_tweet_status._json
                self.history[eval_id]['sarcasm_predicted'] = eval_tweet_sarcastic
                self.save_history()

        elif self.is_valid_feedback(tweet_json):  # if tweet is feedback
            correctly_classified = None
            # analyze feedback
            if re.match(r'.*?\b(j(a|o|ep)?|y(es|o)?)\b.*?', tweet_json['text'],
                        re.IGNORECASE):
                correctly_classified = True
            elif re.match(r'.*?\b(n(e(in)?|o(pe)?)?)\b.*?', tweet_json['text'],
                          re.IGNORECASE):
                correctly_classified = False
            if correctly_classified is not None:  # if feedback could be parsed
                # follow the tweet trail back to the source (feedback -> classification -> enquiry -> classified_tweet)
                class_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id'])
                enq_tweet_status = self.twitter_api.get_status(
                    class_tweet_status.in_reply_to_status_id)
                eval_tweet_status = self.twitter_api.get_status(
                    enq_tweet_status.in_reply_to_status_id)
                # save the evaluation
                eval_id = str(eval_tweet_status.id)  # JSON object keys are strings
                if 'sarcasm_actual' not in self.history[eval_id]:
                    bot_response = self.gen_feedback_response(
                        tweet_json['user']['screen_name'],
                        correctly_classified)
                    # actual label: prediction confirmed -> keep it;
                    # prediction rejected -> the opposite holds
                    self.history[eval_id]['sarcasm_actual'] = (
                        self.history[eval_id]['sarcasm_predicted'] ==
                        correctly_classified)
                    self.save_history()

        if bot_response:
            self.twitter_api.update_status(
                bot_response, tweet_json['id'])  # post response to twitter
            if self.verbose:
                print('responded with: "' + str(bot_response) + '"')

    def save_history(self):
        '''
        Saves the bot's history to file
        '''
        try:
            with open(self.history_path, 'w', encoding='utf8') as fop:
                json.dump(self.history, fop)
        except Exception as ex:
            print('Error: Could not save history to "' + self.history_path +
                  '"')
            print(ex)
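
Based on the keys read by _load_config, a minimal configuration file could be generated like this (every value below is a placeholder):

import json

config = {
    'SCREEN_NAME': 'autosarkasmus',
    'ENQUIRY_PATTERN': r'.*?@autosarkasmus.*?',  # placeholder pattern
    'ENQUIRY_RESPONSES': {'positive': ['Das klingt sarkastisch.'],
                          'negative': ['Das klingt ernst gemeint.']},
    'FEEDBACK_RESPONSES': {'positive': ['Danke!'],
                           'negative': ['Danke, wir arbeiten dran!']},
    'CONSUMER_KEY': '...',  # Twitter API credentials (placeholders)
    'CONSUMER_SECRET': '...',
    'ACCESS_KEY': '...',
    'ACCESS_SECRET': '...',
    'PIPELINE_TAGGER_MAPPING_PATH': './rsrc/de-tiger.map',
    'TRAINING_CORPUS_POSITIVE_PATH': './corpus/tweets_pos.txt',
    'TRAINING_CORPUS_NEGATIVE_PATH': './corpus/tweets_neg.txt',
    'HISTORY_PATH': './history.json',
    'FEATURES': [{'key': 'example_feature', 'values': [0, 1]}]  # shape as read by _load_config
}
with open('bot_config.json', 'w', encoding='utf8') as fop:
    json.dump(config, fop, ensure_ascii=False, indent=2)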
Example 10
# -*- coding: utf-8 -*-
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

import pickle, numpy

from autosarkasmus.preprocessor.pipeline import Pipeline

data = []
for is_sarcastic in [True, False]:
    break  # debug shortcut: skip preprocessing and load the cached pickle below
    print('preprocessing samples with sarcastic=' + str(is_sarcastic) + '...')
    # preprocess tweets
    if is_sarcastic:
        pipeline = Pipeline('../corpus/txt/reviewed_corpus_files/tweets_pos_3099random.txt', '../rsrc/de-tiger.map', verbose=True)
    else:
        pipeline = Pipeline('../corpus/txt/reviewed_corpus_files/tweets_not-pos_3099random.txt', '../rsrc/de-tiger.map', verbose=True)
    tweets_tkn, tweets_proc = pipeline.process()
    for tweet_proc in tweets_proc:
        data.append(
            {
                'tweet': tweet_proc,
                'class': is_sarcastic
            }
        )
data = pickle.load(open('../rsrc/tweets_proc_debug.pkl', 'rb'), encoding='bytes')
# pickle.dump(data, open('../rsrc/tweets_proc_debug.pkl', 'wb'))

embeddings = pickle.load(open('../rsrc/polyglot-de-dict.pkl', 'rb'), encoding='bytes')
Example 11
# -*- coding: utf-8 -*-
import sys
import os.path

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from autosarkasmus.preprocessor.pipeline import Pipeline

pp = Pipeline("test.txt", "../rsrc/de-tiger.map")
tweets, tagged = pp.process()
for i in range(len(tweets)):
    print(" ".join(tweets[i]))
    output = ""
    for token, tag in tagged[i]:
        output += "{}|{} ".format(token, tag)
    print(output.strip())
    print()