def extract_features(self, corpus_file_pos, corpus_file_neg, verbose=False):
    '''
    Extract features from positive and negative corpora.

    Keyword arguments:
        corpus_file_pos (str): path to positive corpus
        corpus_file_neg (str): path to negative corpus
        verbose (bool): stdout verbosity

    Returns:
        list or tuple: the extracted features and, if the texts flag is
        set, also the tokenized raw tweets
    '''
    res = []
    tweet_texts = []

    # extract features
    if verbose:
        print('extracting features...')
    for is_sarcastic in [True, False]:
        if verbose:
            print(' preprocessing samples with sarcastic=' + str(is_sarcastic) + '...')

        # preprocess tweets of the corpus that matches the current class
        if is_sarcastic:
            pipeline = Pipeline(corpus_file_pos, '../rsrc/de-tiger.map', verbose=verbose)
        else:
            pipeline = Pipeline(corpus_file_neg, '../rsrc/de-tiger.map', verbose=verbose)
        tweets_tkn, tweets_proc = pipeline.process()

        if verbose:
            print(' extracting features...')

        # extract features from each (tokenized, processed) tweet pair
        for tweet_index in range(len(tweets_tkn)):
            ext_features = self.extract_features_from_tweet(
                tweets_tkn[tweet_index], tweets_proc[tweet_index], is_sarcastic)
            res.append(ext_features)
        tweet_texts.extend(tweets_tkn)

    if self.texts:
        return res, tweet_texts
    return res
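# Usage sketch: how this method is driven from the training script further
# below, where `features` and `feature_order` are defined. The corpus paths
# here are placeholders, not files from the repo:
#
#     feature_extractor = FeatureExtractor(features, feature_order)
#     samples = feature_extractor.extract_features(
#         'corpus_pos.txt', 'corpus_neg.txt', verbose=True)
#     # with the extractor's texts flag set, a (samples, tweet_texts)
#     # tuple is returned instead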
def preprocess(self): """ Preprocessing based on Scheffler et. al. German Twitter Preprocessing """ tokenizedTweets_writer = open('./daten/tokenized_tweets.txt', 'w') preprocTweets_writer = open('./daten/preprocessed_tweets.txt', 'w') pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map") tweets_tkn, tweets_proc, labels = pp.process() assert (len(tweets_tkn) == len(tweets_proc) == len(labels)) # write preprocessing results to file for x in range(len(tweets_proc)): t_tweet = (" ").join(tweets_tkn[x]) p_tweet = (" ").join( [str(x) + "/" + str(y) for x, y in tweets_proc[x]]) label = labels[x] tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n") preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
def preprocess(self):
    """
    Preprocessing with stopword filtering and IWNLP lemma normalization.
    Assumes `from nltk.corpus import stopwords` and the IWNLPWrapper import
    at module level.
    """
    pp = Pipeline(self.this_file, "./autosarkasmus/rsrc/de-tiger.map")
    tweets_tkn, tweets_proc, labels = pp.process()
    assert len(tweets_tkn) == len(tweets_proc) == len(labels)

    # filter stopwords + normalize tokens
    lemmatizer = IWNLPWrapper(
        lemmatizer_path='daten/IWNLP.Lemmatizer_20170501.json')
    german_stopwords = set(stopwords.words('german'))  # build the set once, not per token
    lemmatized_tokens = []
    for x in range(len(tweets_tkn)):
        tweet = []
        for token in tweets_tkn[x]:
            if token.lower() in german_stopwords:
                continue
            try:
                lemma = lemmatizer.lemmatize_plain(token, ignore_case=True)
                tweet.append(lemma[0] if lemma else token)
            except Exception as e:
                print(e)
                tweet.append(token)  # keep the raw token if lemmatization fails
        lemmatized_tokens.append(tweet)
    assert len(lemmatized_tokens) == len(tweets_proc) == len(labels)

    # write preprocessing results to file (one tab-separated tweet per line)
    with open('./daten/tokenized_tweets_normalized.txt', 'w') as tokenizedTweets_writer, \
            open('./daten/preprocessed_tweets_normalized.txt', 'w') as preprocTweets_writer:
        for x in range(len(lemmatized_tokens)):
            t_tweet = " ".join(lemmatized_tokens[x])
            p_tweet = " ".join(str(tok) + "/" + str(tag) for tok, tag in tweets_proc[x])
            label = labels[x]
            tokenizedTweets_writer.write(t_tweet + "\t" + label + "\n")
            preprocTweets_writer.write(p_tweet + "\t" + label + "\n")
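# Both preprocess() variants write one tweet per line, tab-separated from its
# label. Illustrative lines (the tokens, STTS-style tags, and label value are
# made up for the example; real labels come from the pipeline):
#
#     tokenized_tweets*.txt:      Das war ja klar\tsarcastic
#     preprocessed_tweets*.txt:   Das/PDS war/VAFIN ja/ADV klar/ADJD\tsarcastic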
print('setting up data...')
data = []

if args.model == 'rnn':
    for is_sarcastic in [True, False]:
        print(' preprocessing samples with sarcastic=' + str(is_sarcastic) + '...')

        # preprocess tweets of the corpus that matches the current class
        if is_sarcastic:
            pipeline = Pipeline(args.corpus_file_pos, '../rsrc/de-tiger.map', verbose=True)
        else:
            pipeline = Pipeline(args.corpus_file_neg, '../rsrc/de-tiger.map', verbose=True)
        tweets_tkn, tweets_proc = pipeline.process()

        for tweet_proc in tweets_proc:
            data.append({'tweet': tweet_proc, 'class': is_sarcastic})

if args.model in ['svm', 'mlp']:
    # extract features from the training corpora
    feature_extractor = FeatureExtractor(features, feature_order)
    data = feature_extractor.extract_features(
        args.corpus_file_pos, args.corpus_file_neg, verbose=True)

# classifier setup
classifiers = []
if args.model == 'svm':
    classifiers.append({
        'name': 'svm_classifier',
# -*- coding: utf-8 -*-

import sys
import os.path

# make the repository root importable when running this test directly
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.preprocessor.pipeline import Pipeline

pp = Pipeline("test.txt", "../rsrc/de-tiger.map")
tweets, tagged = pp.process()

for i in range(len(tweets)):
    print(" ".join(tweets[i]))
    output = ""
    for token, tag in tagged[i]:
        output += "{}|{} ".format(token, tag)
    print(output.strip())
    print()
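# Expected console output: for each tweet, the tokenized text followed by its
# token|tag pairs and a blank line. Tokens and tags below are illustrative:
#
#     Das war ja klar
#     Das|PDS war|VAFIN ja|ADV klar|ADJD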