def get_unihash(corpusfile=UNIHASH_CORPUS_FILE):
    '''
    Count unigram and hashtag frequencies in the given corpus.

    Returns:
        tuple: frequent unigrams and frequent hashtags, each a list of
        (token, count) pairs sorted by descending frequency
    '''
    # load stopwords (used below to filter unigrams)
    stopwords = set()
    if corpusfile:
        with open(UNIHASH_STOPWORDS_FILE, "r") as f:
            for line in f:
                stopwords.add(line.strip())

    # count
    unigram_counts = {}
    hashtag_counts = {}
    cr = CorpusReader(corpusfile)
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        for token in tkn.tokenize(tweet):
            if token.startswith('#') and len(token) > 1:
                hashtag_counts.setdefault(token.lower(), 0)
                hashtag_counts[token.lower()] += 1
            else:
                # filter out stopwords, URLs, mentions and single characters
                if (token.lower() not in stopwords and 'http' not in token
                        and '@' not in token and len(token) > 1):
                    unigram_counts.setdefault(token.lower(), 0)
                    unigram_counts[token.lower()] += 1

    # sort by frequency and remove corpus-specific special words; keep only unigrams occurring at least 3 times
    unigram_exclude = ("esc", "eurovision", "bpw", "euro2016")
    frequent_unigrams = [(tok, cnt) for tok, cnt in sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
                         if cnt > 2 and not any(w in tok for w in unigram_exclude)]
    # sort by frequency and remove corpus-specific special words; keep only hashtags occurring at least 4 times
    hashtag_exclude = unigram_exclude + ("pokemongo", "gerita", "brexit", "gerfra", "em2016")
    frequent_hashtags = [(tok, cnt) for tok, cnt in sorted(hashtag_counts.items(), key=lambda item: item[1], reverse=True)
                         if cnt > 3 and not any(w in tok for w in hashtag_exclude)]
    return frequent_unigrams, frequent_hashtags
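
A minimal call site for this function might look like the sketch below; it assumes that UNIHASH_CORPUS_FILE and UNIHASH_STOPWORDS_FILE are defined earlier in the same module and point at existing files.

# sketch of a call site; the module-level constants are assumed to be defined above
frequent_unigrams, frequent_hashtags = get_unihash()
print("top unigrams:", frequent_unigrams[:10])   # most frequent (token, count) pairs
print("top hashtags:", frequent_hashtags[:10])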
Example #2
    def process(self):
        '''
        Process the entire given corpus

        Returns:
            tuple: list of tokenized tweets, list of their normalized and tagged counterparts
        '''
        res_tkn = []
        res_proc = []
        corpus = CorpusReader(self.corpus_path)
        # choose iterator based on corpus type
        corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()
        for tweet_i, tweet_raw in enumerate(corpus_iter):
            if self.verbose:
                sys.stdout.write('\rtweet: %d of %d' %
                                 (tweet_i + 1, len(corpus_iter)))
                sys.stdout.flush()
            tweet_tkn, tweet_proc = self.process_tweet(tweet_raw)
            res_tkn.append(tweet_tkn)
            res_proc.append(tweet_proc)
        if self.verbose:
            sys.stdout.write('\rpreprocessing complete (%d tweets)' %
                             (len(corpus_iter)) +
                             (' ' * len(str(len(corpus_iter))) + '\n'))
        return res_tkn, res_proc, corpus.labels
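
The sketch below shows how the three return values of process() line up. The class name Pipeline, its import path, and the constructor arguments are assumptions; substitute whichever preprocessor class actually exposes process() in the project.

# hypothetical driver; Pipeline, its import path and constructor signature are assumptions
from autosarkasmus.preprocessor.pipeline import Pipeline  # assumed import path

pipeline = Pipeline("test.txt", verbose=True)  # assumed signature
tokenized, processed, labels = pipeline.process()
for tweet_tkn, tweet_proc, label in zip(tokenized, processed, labels):
    print(label, tweet_tkn)
    print("\t" + str(tweet_proc))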
Example #3
	def __init__(self, pos_set, neg_set, k, outputfile, features="full_featured"):
		'''
		Constructor of SVMClassifier

		Keyword arguments:
			pos_set (str): path to the positive corpus file
			neg_set (str): path to the negative corpus file
			k (int): number of folds for k-fold cross-validation
			outputfile (str): filename to save the results to
			features (str): feature set to validate on ('full_featured' or 'unigram_featured'); defaults to 'full_featured'
		'''
		self.pos_set = CorpusReader(pos_set).date_id_text()
		self.neg_set = CorpusReader(neg_set).date_id_text()
		self.k = k
		self.outputfile = outputfile
		self.features = features
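
Constructing the classifier for, say, 10-fold cross-validation could look like the following sketch; the import path and the corpus file names are assumptions, only the constructor signature comes from the code above.

# sketch of constructing the classifier; import path and file names are assumptions
from autosarkasmus.classifier.svm_classifier import SVMClassifier  # assumed import path

clf = SVMClassifier(
    "tweets_positive.txt",         # path to the positive (sarcastic) corpus
    "tweets_negative.txt",         # path to the negative (non-sarcastic) corpus
    k=10,                          # 10-fold cross-validation
    outputfile="svm_results.txt",  # where the results are written
    features="unigram_featured"    # or "full_featured"
)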
Example #4
    def _initialize_normalizer(self):
        '''
        Initialization of Normalizer

        Since the normalizer requires training data, it is only initialized shortly before it is needed.
        This is only required once.
        '''
        normalizer = Normalizer()
        corpus = CorpusReader(self.corpus_path)
        # choose iterator based on corpus type
        corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()
        for tweet in corpus_iter:
            tweet_tkn = self.tokenize(tweet)
            data = normalizer.get_contexts(tweet_tkn)
            for (token, context) in data:
                normalizer.collect_bigrams(
                    token, context)  # train on token_bigrams in corpus
        self.normalizer = normalizer
Example #5
# -*- coding: utf-8 -*-
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer

if __name__ == "__main__":
    cr = CorpusReader("test.txt")
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        print(tweet)
        print("\t" + str(tkn.tokenize(tweet)) + "\n")
Example #6
# -*- coding: utf-8 -*-
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer
from autosarkasmus.preprocessor.normalizer.normalizer import Normalizer

if __name__ == "__main__":
    corpus = CorpusReader("test.txt")
    tweets = corpus.text_txt()
    tokenizer = Tokenizer()
    normalizer = Normalizer()
    for tweet in tweets:
        # first pass: set up unigram & bigram counts
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            normalizer.collect_bigrams(token, context)

    for tweet in tweets:
        # second pass: normalize each token in its context
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            tn = normalizer.normalize(token, context)
            print("{} -> {}".format(token, tn))