from nltk.corpus import WordListCorpusReader


def load_token_list(term_file):
    '''
    Load a word list (e.g. a stopword list) from the corpus directory.
    '''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]  # escape '+' so tokens are regex-safe
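A minimal usage sketch, assuming a word list such as mimetypes.txt exists under ../corpora/:

tokens = load_token_list('mimetypes.txt')  # e.g. ['application/pdf', 'text/xml', ...]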
Example #2
def load_token_list(term_file):
    '''
    Load a word list (e.g. a stopword list) from the corpus directory.
    '''
    __location__ = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]  # escape '+' so tokens are regex-safe
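Here _corpus_root is a module-level constant the fragment does not show; a hypothetical setup for illustration:

import os
from nltk.corpus import WordListCorpusReader

_corpus_root = 'corpora'  # word-list directory, resolved relative to this module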
Example #4
def extract_mimetypes(text, do_replace=True):
    '''
    Pull a list of mimetypes from a text feature.

    Returns the mimetypes found in the text block, plus the text
    either stripped of those mimetypes (do_replace) or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')

    found_mimetypes = [w for w in mimetypes.words() if w in text]

    if do_replace:
        text = remove_tokens('mimetypes.txt', text)

    return found_mimetypes, text
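The snippet relies on a remove_tokens helper that is not shown. A plausible sketch, reusing load_token_list from the examples above and treating each entry as a regex pattern (which would explain why load_token_list escapes '+'):

import re

def remove_tokens(term_file, text):
    # Hypothetical helper: strip every listed token from the text.
    for pattern in load_token_list(term_file):
        text = re.sub(pattern, '', text)
    return text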
Example #7
    def __init__(self, language, sw_files=None, load_default=True):
        self.language = language
        self.stopwords = []
        sw_files = sw_files or []  # avoid the shared-mutable-default pitfall

        if load_default:
            wlcr = WordListCorpusReader(data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
                                        [language], encoding="utf-8")
            self.stopwords = wlcr.words(language)
            logging.info("Loaded default stopwords from file %s" % DEFAULT_SW_FILE)

        path = BASE_SW_PATH + language
        for sw_file in sw_files:
            wlcr = WordListCorpusReader(data.FileSystemPathPointer(path),
                                        sw_file, encoding="utf-8")
            self.stopwords += wlcr.words(sw_file)
            logging.info("Loaded stopwords from file '%s'" % sw_file)
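DEFAULT_SW_FILE and BASE_SW_PATH are defined outside this fragment; hypothetical values for illustration:

BASE_SW_PATH = "data/stopwords/"               # one directory of word lists per language
DEFAULT_SW_FILE = "data/stopwords/default.gz"  # gzipped archive of the default lists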
Example #8
"""
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/28 9:32
"""
import re
from gensim import corpora, models, similarities
import xml.etree.ElementTree as Et
from GetData.preprocess import getdata, ltp, ltp_pos
from nltk.corpus import WordListCorpusReader
import jieba
import jieba.posseg as pseg
import logging

# load the stopword list
STOP_PATH = r'D:\MyProject\pythonProjects\TopicMine\LDA_T\data\\'
stopwords = set(WordListCorpusReader(STOP_PATH, 'stopwords.txt').words())
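
def remove_stopwords(words):
    # Illustrative helper (not in the original): drop stopwords from a
    # token sequence, e.g. the output of jieba.cut().
    return [w for w in words if w not in stopwords]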


#
def parse_lda_xml(file):
    with open(file, 'r', encoding='utf-8') as f:
        xml_raw = f.read().strip().split('\n\n')  # raw text; may hold several XML docs, split on blank lines
    docs = []  # collected results
    for doc in xml_raw:
        xml = Et.fromstring(doc)
        doc_words = []
        for sentence in xml.findall('./doc/para/sent'):  # iterate over the sentences
            word_list = [words for words in sentence]  # gather the word elements
            wordsall = []  # tokens collected for this sentence
            for word in word_list:  # parse each word element
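                # -- truncated in the original; a plausible completion follows --
                # (assumes LTP-style XML, where each <word> carries its token in 'cont')
                token = word.get('cont')
                if token and token not in stopwords:
                    wordsall.append(token)
            doc_words.extend(wordsall)
        docs.append(doc_words)
    return docs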
Example #9
import os
from nltk.corpus import WordListCorpusReader


def read_stopwords(path):
    '''Read a stopword list with nltk.
    '''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
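A usage sketch (the path is illustrative):

stop_list = read_stopwords('./data/stopwords.txt')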
class OpinionSentenceCollector:
    def __init__(self, features, feature_sentences):
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []

        self.init_corpus()

        for sentence_index in xrange(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                # Extracting the feature from the (feature, count) tuple
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in xrange(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        if word.find(feature.split()[0]) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append((feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')


    def calculate_sent_score(self, tags, tag_index):

        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''

        for i in xrange(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        start = 0
        negation_words = ''

        for j in xrange(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        final_score = positive_sentiment_score - negative_sentiment_score

        #print "Sentiment Score", final_score, adjective
        return final_score, adjective
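A minimal usage sketch; the feature_sentences layout (keys 'nouns', 'noun_phrases', 'tags', 'sentence') is inferred from the code above, the sample data is illustrative, and the word-list files under ../data/corpus/ must exist:

features = [('battery life', 12)]  # (feature, count) tuples
feature_sentences = [{
    'sentence': 'The battery life is great',
    'nouns': ['battery life'],
    'noun_phrases': [],
    'tags': [('The', 'DT'), ('battery', 'NN'), ('life', 'NN'),
             ('is', 'VBZ'), ('great', 'JJ')],
}]
collector = OpinionSentenceCollector(features, feature_sentences)
print(collector.opinion_sentences)  # e.g. [('battery life', 1, 'The battery life is great')]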
Example #14
delete = (config.has_option('cooking', 'delete')
          and config.get('cooking', 'delete').split(REGEX_SEPARATOR)) or ''
keywords = (config.has_option('cooking', 'keywords') and config.get(
    'cooking', 'keywords').lower().split(',')) or 'movistar'
oficial_users = (config.has_option('cooking', 'oficial_users') and config.get(
    'cooking', 'oficial_users').lower().split(',')) or 'movistar'
languages = (config.has_option('cooking', 'languages') and config.get(
    'cooking', 'languages').lower().split(',')) or 'spanish'
steps = config.get('cooking', 'steps').lower().split(',')
text_field_out = (config.has_option('cooking', 'text_field_out')
                  and config.get('cooking', 'text_field_out').lower()) or ''
text_field_in = (config.has_option('cooking', 'text_field_in')
                 and config.get('cooking', 'text_field_in').lower()) or ''

# Loading serving section
output_fields = (config.has_option('serving', 'output_fields') and config.get(
    'serving', 'output_fields').lower().split(',')) or ''
output_separator = (config.has_option('serving', 'output_separator')
                    and config.get('serving', 'output_separator')) or ','

# Reading corpus
data.path = [os.getcwd()] + data.path
stopwords = WordListCorpusReader(
    data.GzipFileSystemPathPointer('stopwords.zip'), languages)

# Loading dependencies
yamlImport = zipimport.zipimporter('yaml.zip')
yaml = yamlImport.load_module('yaml')
nltkImport = zipimport.zipimporter('nltk.zip')
nltk = nltkImport.load_module('nltk')
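The fragment implies an INI-style configuration with 'cooking' and 'serving' sections; a minimal sketch of such a file (all values illustrative):

[cooking]
keywords = movistar,fibra
oficial_users = movistar
languages = spanish
steps = clean,tokenize
text_field_in = text
text_field_out = clean_text

[serving]
output_fields = id,clean_text
output_separator = ,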
Example #15
from .VerbValencyReader import VerbValencyReader
from .DadeganReader import DadeganReader
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
from .InformalNormalizer import InformalNormalizer, InformalLemmatizer
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker, tree2brackets
from .DependencyParser import DependencyParser, MaltParser, TurboParser
from .SentenceTokenizer import SentenceTokenizer
from .WordTokenizer import WordTokenizer

from .utils import default_stopwords
from nltk.corpus import WordListCorpusReader
stopwords = WordListCorpusReader('', [default_stopwords], encoding='utf8')


def sent_tokenize(text):
    if not hasattr(sent_tokenize, 'tokenizer'):
        sent_tokenize.tokenizer = SentenceTokenizer()
    return sent_tokenize.tokenizer.tokenize(text)


def word_tokenize(sentence):
    if not hasattr(word_tokenize, 'tokenizer'):
        word_tokenize.tokenizer = WordTokenizer()
    return word_tokenize.tokenizer.tokenize(sentence)
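Both wrappers build their tokenizer lazily on first call and cache it as an attribute of the function itself, so repeated calls share one instance. A brief usage sketch (sample text illustrative):

for sent in sent_tokenize('این یک جمله است. این هم جملهٔ دیگری است.'):
    print(word_tokenize(sent))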
Example #16
from nltk.corpus import WordListCorpusReader


class OpinionSentenceFinder:
    def __init__(self, features, feature_sentences):
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index][
                                'opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        self.negation_words = WordListCorpusReader('../data/corpora/',
                                                   'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        pass  # stub: not implemented yet

    """
		Todo: concat consecutive JJ's (Opt.) 
		      Remove meaningless JJ's (95% done.)
		      Implement lemmatizing while checking JJ's
		      Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
		      Negation of opinions. (done.)
		      (Opt.) Append (RR, RB) to the JJ
		      Special treatment for NOUNS in pros
			Fix neg bug
	"""

    def get_nearest_JJ(self, tags, n_index):
        adj = ''
        neg = ''
        sentiment = None
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            end = -1  # nothing found ahead: scan all the way back
            neg = ''
        else:
            end = n_index - (i - n_index) - 1  # scan back no farther than we went forward
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)
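A minimal usage sketch, mirroring the collector example above (the word-list files under ../data/corpora/ must exist):

finder = OpinionSentenceFinder(features, feature_sentences)
# each opinion sentence pairs a feature with a (sentiment, negation_word, adjective) triple
print(finder.opinion_sentences)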