def get_unihash(corpusfile=UNIHASH_CORPUS_FILE):
    '''
    Count unigram and hashtag frequencies in the given corpus.

    Returns:
        tuple: frequent unigrams and frequent hashtags, each a list of
        (token, count) pairs sorted by descending frequency
    '''
    # load stopwords (used below to filter unigrams)
    stopwords = set()
    if corpusfile:
        with open(UNIHASH_STOPWORDS_FILE, "r") as f:
            for line in f:
                stopwords.add(line.strip())

    # count
    unigram_counts = {}
    hashtag_counts = {}
    cr = CorpusReader(corpusfile)
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        for token in tkn.tokenize(tweet):
            if token.startswith('#') and len(token) > 1:
                hashtag_counts.setdefault(token.lower(), 0)
                hashtag_counts[token.lower()] += 1
            else:
                # filter out stopwords, URLs, mentions and single characters
                if (token.lower() not in stopwords and 'http' not in token
                        and '@' not in token and len(token) > 1):
                    unigram_counts.setdefault(token.lower(), 0)
                    unigram_counts[token.lower()] += 1

    # sort by frequency and remove corpus-specific special words; keep only unigrams occurring at least 3 times
    unigram_exclude = ("esc", "eurovision", "bpw", "euro2016")
    frequent_unigrams = [(tok, cnt) for tok, cnt in sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
                         if cnt > 2 and not any(w in tok for w in unigram_exclude)]
    # sort by frequency and remove corpus-specific special words; keep only hashtags occurring at least 4 times
    hashtag_exclude = unigram_exclude + ("pokemongo", "gerita", "brexit", "gerfra", "em2016")
    frequent_hashtags = [(tok, cnt) for tok, cnt in sorted(hashtag_counts.items(), key=lambda item: item[1], reverse=True)
                         if cnt > 3 and not any(w in tok for w in hashtag_exclude)]
    return frequent_unigrams, frequent_hashtags
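
A minimal call site for this function might look like the sketch below; it assumes that UNIHASH_CORPUS_FILE and UNIHASH_STOPWORDS_FILE are defined earlier in the same module and point at existing files.

# sketch of a call site; the module-level constants are assumed to be defined above
frequent_unigrams, frequent_hashtags = get_unihash()
print("top unigrams:", frequent_unigrams[:10])   # most frequent (token, count) pairs
print("top hashtags:", frequent_hashtags[:10])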
Example #2
    def process(self):
        '''
        Process the entire given corpus

        Returns:
            tuple: list of tokenized tweets, list of their normalized and tagged counterparts
        '''
        res_tkn = []
        res_proc = []
        corpus = CorpusReader(self.corpus_path)
        # choose iterator based on corpus type
        corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()
        for tweet_i, tweet_raw in enumerate(corpus_iter):
            if self.verbose:
                sys.stdout.write('\rtweet: %d of %d' %
                                 (tweet_i + 1, len(corpus_iter)))
                sys.stdout.flush()
            tweet_tkn, tweet_proc = self.process_tweet(tweet_raw)
            res_tkn.append(tweet_tkn)
            res_proc.append(tweet_proc)
        if self.verbose:
            sys.stdout.write('\rpreprocessing complete (%d tweets)' %
                             (len(corpus_iter)) +
                             (' ' * len(str(len(corpus_iter))) + '\n'))
        return res_tkn, res_proc, corpus.labels
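
The sketch below shows how the three return values of process() line up. The class name Pipeline, its import path, and the constructor arguments are assumptions; substitute whichever preprocessor class actually exposes process() in the project.

# hypothetical driver; Pipeline, its import path and constructor signature are assumptions
from autosarkasmus.preprocessor.pipeline import Pipeline  # assumed import path

pipeline = Pipeline("test.txt", verbose=True)  # assumed signature
tokenized, processed, labels = pipeline.process()
for tweet_tkn, tweet_proc, label in zip(tokenized, processed, labels):
    print(label, tweet_tkn)
    print("\t" + str(tweet_proc))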
Example #3
	def __init__(self, pos_set, neg_set, k, outputfile, features="full_featured"):
		'''
		Constructor of SVMClassifier

		Keyword arguments:
			pos_set (str): path to the positive corpus file
			neg_set (str): path to the negative corpus file
			k (int): number of folds for k-fold cross-validation
			outputfile (str): filename to save the results to
			features (str): feature set to validate on ('full_featured' or 'unigram_featured'); defaults to 'full_featured'
		'''
		self.pos_set = CorpusReader(pos_set).date_id_text()
		self.neg_set = CorpusReader(neg_set).date_id_text()
		self.k = k
		self.outputfile = outputfile
		self.features = features
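
Constructing the classifier for, say, 10-fold cross-validation could look like the following sketch; the import path and the corpus file names are assumptions, only the constructor signature comes from the code above.

# sketch of constructing the classifier; import path and file names are assumptions
from autosarkasmus.classifier.svm_classifier import SVMClassifier  # assumed import path

clf = SVMClassifier(
    "tweets_positive.txt",         # path to the positive (sarcastic) corpus
    "tweets_negative.txt",         # path to the negative (non-sarcastic) corpus
    k=10,                          # 10-fold cross-validation
    outputfile="svm_results.txt",  # where the results are written
    features="unigram_featured"    # or "full_featured"
)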
Example #4
    def _initialize_normalizer(self):
        '''
        Initialization of Normalizer

        Since the normalizer requires training data, it is only initialized shortly before it is needed.
        This is only required once.
        '''
        normalizer = Normalizer()
        corpus = CorpusReader(self.corpus_path)
        # choose iterator based on corpus type
        corpus_iter = corpus.text_json() if self.json_corpus else corpus.text_txt()
        for tweet in corpus_iter:
            tweet_tkn = self.tokenize(tweet)
            data = normalizer.get_contexts(tweet_tkn)
            for (token, context) in data:
                normalizer.collect_bigrams(
                    token, context)  # train on token_bigrams in corpus
        self.normalizer = normalizer
Example #5
# -*- coding: utf-8 -*-
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer

if __name__ == "__main__":
    cr = CorpusReader("test.txt")
    tkn = Tokenizer()
    for tweet in cr.text_txt():
        print(tweet)
        print("\t" + str(tkn.tokenize(tweet)) + "\n")
Example #6
# -*- coding: utf-8 -*-
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from autosarkasmus.corpus.corpus_reader import CorpusReader
from autosarkasmus.preprocessor.tokenizer.tokenizer import Tokenizer
from autosarkasmus.preprocessor.normalizer.normalizer import Normalizer

if __name__ == "__main__":
    corpus = CorpusReader("test.txt")
    tweets = corpus.text_txt()
    tokenizer = Tokenizer()
    normalizer = Normalizer()
    for tweet in tweets:
        # first pass: set up unigram & bigram counts
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            normalizer.collect_bigrams(token, context)

    for tweet in tweets:
        # second pass: normalize each token in its context
        tweet = tokenizer.tokenize(tweet)
        c = normalizer.get_contexts(tweet)
        for (token, context) in c:
            tn = normalizer.normalize(token, context)
            print("{} -> {}".format(token, tn))