def get_unihash(corpusfile=UNIHASH_CORPUS_FILE):
    '''
    Count unigram and hashtag frequencies in the given corpus, ignoring stopwords,
    URLs and @-mentions, and return the most frequent of each.
    '''
    # load stopwords
    stopwords = []
    if corpusfile:
        with open(UNIHASH_STOPWORDS_FILE, "r") as f:
            for line in f:
                line = line.strip()
                stopwords.append(line)

    # count unigram and hashtag occurrences
    unigram_counts = {}
    hashtag_counts = {}
    cr = CorpusReader(corpusfile)
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        for token in tkn.tokenize(tweet):
            if token.startswith('#') and len(token) > 1:
                hashtag_counts.setdefault(token.lower(), 0)
                hashtag_counts[token.lower()] += 1
            else:
                # filter stopwords, URLs, @-mentions and single characters
                if token.lower() not in stopwords and 'http' not in token and '@' not in token and len(token) > 1:
                    unigram_counts.setdefault(token.lower(), 0)
                    unigram_counts[token.lower()] += 1

    # sort and remove special words; only keep unigrams that occur at least 3 times
    frequent_unigrams = [
        item for item in sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
        if item[1] > 2
        and "esc" not in item[0] and "eurovision" not in item[0]
        and "bpw" not in item[0] and "euro2016" not in item[0]
    ]

    # sort and remove special words; only keep hashtags that occur at least 4 times
    frequent_hashtags = [
        item for item in sorted(hashtag_counts.items(), key=lambda item: item[1], reverse=True)
        if item[1] > 3
        and "esc" not in item[0] and "eurovision" not in item[0]
        and "bpw" not in item[0] and "euro2016" not in item[0]
        and "pokemongo" not in item[0] and "gerita" not in item[0]
        and "brexit" not in item[0] and "gerfra" not in item[0]
        and "em2016" not in item[0]
    ]

    return frequent_unigrams, frequent_hashtags
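# Hedged usage sketch (not part of the original module): it assumes the surrounding
# module defines UNIHASH_CORPUS_FILE / UNIHASH_STOPWORDS_FILE and imports
# CorpusReader and Tokenizer as in the test scripts further below. It only shows how
# the returned (token, count) lists, sorted by descending frequency, can be inspected.
if __name__ == "__main__":
    frequent_unigrams, frequent_hashtags = get_unihash()
    print("top unigrams:", frequent_unigrams[:10])
    print("top hashtags:", frequent_hashtags[:10])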
def process(self):
    '''
    Process the entire given corpus.

    Returns:
        tuple: list of tokenized tweets, list of their normalized and tagged counterparts, list of corpus labels
    '''
    res_tkn = []
    res_proc = []

    corpus = CorpusReader(self.corpus_path)
    # check for corpus type
    corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()

    for tweet_i, tweet_raw in enumerate(corpus_iter):
        if self.verbose:
            sys.stdout.write('\rtweet: %d of %d' % (tweet_i + 1, len(corpus_iter)))
            sys.stdout.flush()
        tweet_tkn, tweet_proc = self.process_tweet(tweet_raw)
        res_tkn.append(tweet_tkn)
        res_proc.append(tweet_proc)

    if self.verbose:
        sys.stdout.write('\rpreprocessing complete (%d tweets)' % (len(corpus_iter)) + (' ' * len(str(len(corpus_iter))) + '\n'))

    return res_tkn, res_proc, corpus.labels
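# Hedged usage sketch: this method appears to belong to a preprocessing pipeline
# class whose name and constructor are not shown in this section, so both are
# assumptions here (the class is called Preprocessor purely for illustration).
#
#   pipeline = Preprocessor(corpus_path="tweets.txt", json_corpus=False, verbose=True)
#   tokenized, processed, labels = pipeline.process()
#
# `tokenized` holds one token list per tweet, `processed` the normalized and
# tagged counterparts, and `labels` the labels provided by the CorpusReader.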
def __init__(self, pos_set, neg_set, k, outputfile, features="full_featured"):
    '''
    Constructor of SVMClassifier.

    Keyword arguments:
        pos_set (str): path to the positive corpus file
        neg_set (str): path to the negative corpus file
        k (int): number of folds for k-fold cross-validation
        outputfile (str): filename to save the results in
        features (str): the feature set to validate on
            ('full_featured' or 'unigram_featured'), default 'full_featured'
    '''
    self.pos_set = CorpusReader(pos_set).date_id_text()
    self.neg_set = CorpusReader(neg_set).date_id_text()
    self.k = k
    self.outputfile = outputfile
    self.features = features
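# Hedged usage sketch of the constructor above. The argument order and the
# feature-set values come from the docstring; the import path and the file
# names are assumptions and may differ in the actual package layout.
#
#   from autosarkasmus.classifier.svm_classifier import SVMClassifier  # assumed path
#
#   clf = SVMClassifier(
#       "corpus/positive.txt",       # pos_set: positive corpus file (placeholder path)
#       "corpus/negative.txt",       # neg_set: negative corpus file (placeholder path)
#       10,                          # k: 10-fold cross-validation
#       "results.txt",               # outputfile for the validation results
#       features="unigram_featured"  # or the default "full_featured"
#   )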
def _initialize_normalizer(self):
    '''
    Initialization of the Normalizer.

    Since the normalizer requires training data, it is only initialized
    shortly before it is needed. This is only required once.
    '''
    normalizer = Normalizer()

    corpus = CorpusReader(self.corpus_path)
    # check for corpus type
    corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()

    for tweet in corpus_iter:
        tweet_tkn = self.tokenize(tweet)
        data = normalizer.get_contexts(tweet_tkn)
        for (token, context) in data:
            # train on token bigrams in the corpus
            normalizer.collect_bigrams(token, context)

    self.normalizer = normalizer
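# Hedged note on how the trained normalizer is used afterwards (the call pattern
# is taken from the normalizer test script at the end of this section):
#
#   for (token, context) in self.normalizer.get_contexts(tweet_tkn):
#       normalized_token = self.normalizer.normalize(token, context)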
# -*- coding: utf-8 -*-

import sys
import os.path

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer

if __name__ == "__main__":
    cr = CorpusReader("test.txt")
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        print(tweet)
        print("\t" + str(tkn.tokenize(tweet)) + "\n")
# -*- coding: utf-8 -*-

import sys
import os.path

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer
from autosarkasmus.preprocessor.normalizer.normalizer import Normalizer

if __name__ == "__main__":
    corpus = CorpusReader("test.txt")
    tweets = corpus.text_txt()
    tokenizer = Tokenizer()
    normalizer = Normalizer()

    # first pass: set up unigram & bigram counts
    for tweet in tweets:
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            normalizer.collect_bigrams(token, context)

    # second pass: normalize
    for tweet in tweets:
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            tn = normalizer.normalize(token, context)
            print("{} -> {}".format(token, tn))