from nltk.corpus import WordListCorpusReader


def load_token_list(term_file):
    '''
    Load a word list (e.g. a stopword list) from the corpus directory.
    '''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]  # escape '+' so tokens are regex-safe
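A minimal usage sketch, assuming a word list such as mimetypes.txt exists under ../corpora/:

tokens = load_token_list('mimetypes.txt')  # e.g. ['application/pdf', 'text/xml', ...]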
Example #2
def load_token_list(term_file):
    '''
    Load a word list (e.g. a stopword list) from the corpus directory.
    '''
    __location__ = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    return [w.replace('+', r'\+') for w in tokens.words()]  # escape '+' so tokens are regex-safe
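Here _corpus_root is a module-level constant the fragment does not show; a hypothetical setup for illustration:

import os
from nltk.corpus import WordListCorpusReader

_corpus_root = 'corpora'  # word-list directory, resolved relative to this module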
Example #4
def extract_mimetypes(text, do_replace=True):
    '''
    Pull a list of mimetypes from a text feature.

    Returns the mimetypes found in the text block, plus the text
    either stripped of those mimetypes (do_replace) or unmodified.
    '''
    mimetypes = WordListCorpusReader(_corpus_root, 'mimetypes.txt')

    found_mimetypes = [w for w in mimetypes.words() if w in text]

    if do_replace:
        text = remove_tokens('mimetypes.txt', text)

    return found_mimetypes, text
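The snippet relies on a remove_tokens helper that is not shown. A plausible sketch, reusing load_token_list from the examples above and treating each entry as a regex pattern (which would explain why load_token_list escapes '+'):

import re

def remove_tokens(term_file, text):
    # Hypothetical helper: strip every listed token from the text.
    for pattern in load_token_list(term_file):
        text = re.sub(pattern, '', text)
    return text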
Example #7
    def __init__(self, language, sw_files=None, load_default=True):
        self.language = language
        self.stopwords = []
        sw_files = sw_files or []  # avoid the shared-mutable-default pitfall

        if load_default:
            wlcr = WordListCorpusReader(data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
                                        [language], encoding="utf-8")
            self.stopwords = wlcr.words(language)
            logging.info("Loaded default stopwords from file %s" % DEFAULT_SW_FILE)

        path = BASE_SW_PATH + language
        for sw_file in sw_files:
            wlcr = WordListCorpusReader(data.FileSystemPathPointer(path),
                                        sw_file, encoding="utf-8")
            self.stopwords += wlcr.words(sw_file)
            logging.info("Loaded stopwords from file '%s'" % sw_file)
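DEFAULT_SW_FILE and BASE_SW_PATH are defined outside this fragment; hypothetical values for illustration:

BASE_SW_PATH = "data/stopwords/"               # one directory of word lists per language
DEFAULT_SW_FILE = "data/stopwords/default.gz"  # gzipped archive of the default lists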
Example #8
"""
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/28 9:32
"""
import re
from gensim import corpora, models, similarities
import xml.etree.ElementTree as Et
from GetData.preprocess import getdata, ltp, ltp_pos
from nltk.corpus import WordListCorpusReader
import jieba
import jieba.posseg as pseg
import logging

# load the stopword list
STOP_PATH = r'D:\MyProject\pythonProjects\TopicMine\LDA_T\data\\'
stopwords = set(WordListCorpusReader(STOP_PATH, 'stopwords.txt').words())
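
def remove_stopwords(words):
    # Illustrative helper (not in the original): drop stopwords from a
    # token sequence, e.g. the output of jieba.cut().
    return [w for w in words if w not in stopwords]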


#
def parse_lda_xml(file):
    with open(file, 'r', encoding='utf-8') as f:
        xml_raw = f.read().strip().split('\n\n')  # raw text; may hold several XML docs, split on blank lines
    docs = []  # collected results
    for doc in xml_raw:
        xml = Et.fromstring(doc)
        doc_words = []
        for sentence in xml.findall('./doc/para/sent'):  # iterate over the sentences
            word_list = [words for words in sentence]  # gather the word elements
            wordsall = []  # tokens collected for this sentence
            for word in word_list:  # parse each word element
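                # -- truncated in the original; a plausible completion follows --
                # (assumes LTP-style XML, where each <word> carries its token in 'cont')
                token = word.get('cont')
                if token and token not in stopwords:
                    wordsall.append(token)
            doc_words.extend(wordsall)
        docs.append(doc_words)
    return docs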
Example #9
import os
from nltk.corpus import WordListCorpusReader


def read_stopwords(path):
    '''Read a stopword list with nltk.
    '''
    root, fileid = os.path.split(path)
    stopwords = WordListCorpusReader(root, [fileid])
    return stopwords.words(fileid)
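A usage sketch (the path is illustrative):

stop_list = read_stopwords('./data/stopwords.txt')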
class OpinionSentenceCollector:
    def __init__(self, features, feature_sentences):
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []

        self.init_corpus()

        for sentence_index in xrange(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                # Extracting the feature from the (feature, count) tuple
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in xrange(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        if word.find(feature.split()[0]) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append((feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')


    def calculate_sent_score(self, tags, tag_index):

        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''

        for i in xrange(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        start = 0
        negation_words = ''

        for j in xrange(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1

        final_score = positive_sentiment_score - negative_sentiment_score

        #print "Sentiment Score", final_score, adjective
        return final_score, adjective
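A minimal usage sketch; the feature_sentences layout (keys 'nouns', 'noun_phrases', 'tags', 'sentence') is inferred from the code above, the sample data is illustrative, and the word-list files under ../data/corpus/ must exist:

features = [('battery life', 12)]  # (feature, count) tuples
feature_sentences = [{
    'sentence': 'The battery life is great',
    'nouns': ['battery life'],
    'noun_phrases': [],
    'tags': [('The', 'DT'), ('battery', 'NN'), ('life', 'NN'),
             ('is', 'VBZ'), ('great', 'JJ')],
}]
collector = OpinionSentenceCollector(features, feature_sentences)
print(collector.opinion_sentences)  # e.g. [('battery life', 1, 'The battery life is great')]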
Example #14
delete = (config.has_option('cooking', 'delete')
          and config.get('cooking', 'delete').split(REGEX_SEPARATOR)) or ''
keywords = (config.has_option('cooking', 'keywords') and config.get(
    'cooking', 'keywords').lower().split(',')) or 'movistar'
oficial_users = (config.has_option('cooking', 'oficial_users') and config.get(
    'cooking', 'oficial_users').lower().split(',')) or 'movistar'
languages = (config.has_option('cooking', 'languages') and config.get(
    'cooking', 'languages').lower().split(',')) or 'spanish'
steps = config.get('cooking', 'steps').lower().split(',')
text_field_out = (config.has_option('cooking', 'text_field_out')
                  and config.get('cooking', 'text_field_out').lower()) or ''
text_field_in = (config.has_option('cooking', 'text_field_in')
                 and config.get('cooking', 'text_field_in').lower()) or ''

# Loading serving section
output_fields = (config.has_option('serving', 'output_fields') and config.get(
    'serving', 'output_fields').lower().split(',')) or ''
output_separator = (config.has_option('serving', 'output_separator')
                    and config.get('serving', 'output_separator')) or ','

# Reading corpus
data.path = [os.getcwd()] + data.path
stopwords = WordListCorpusReader(
    data.GzipFileSystemPathPointer('stopwords.zip'), languages)

# Loading dependencies
yamlImport = zipimport.zipimporter('yaml.zip')
yaml = yamlImport.load_module('yaml')
nltkImport = zipimport.zipimporter('nltk.zip')
nltk = nltkImport.load_module('nltk')
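The fragment implies an INI-style configuration with 'cooking' and 'serving' sections; a minimal sketch of such a file (all values illustrative):

[cooking]
keywords = movistar,fibra
oficial_users = movistar
languages = spanish
steps = clean,tokenize
text_field_in = text
text_field_out = clean_text

[serving]
output_fields = id,clean_text
output_separator = ,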
Example #15
from .VerbValencyReader import VerbValencyReader
from .DadeganReader import DadeganReader
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
from .InformalNormalizer import InformalNormalizer, InformalLemmatizer
from .Stemmer import Stemmer
from .Lemmatizer import Lemmatizer
from .SequenceTagger import SequenceTagger, IOBTagger
from .POSTagger import POSTagger, StanfordPOSTagger
from .Chunker import Chunker, RuleBasedChunker, tree2brackets
from .DependencyParser import DependencyParser, MaltParser, TurboParser
from .SentenceTokenizer import SentenceTokenizer
from .WordTokenizer import WordTokenizer

from .utils import default_stopwords
from nltk.corpus import WordListCorpusReader
stopwords = WordListCorpusReader('', [default_stopwords], encoding='utf8')


def sent_tokenize(text):
    if not hasattr(sent_tokenize, 'tokenizer'):
        sent_tokenize.tokenizer = SentenceTokenizer()
    return sent_tokenize.tokenizer.tokenize(text)


def word_tokenize(sentence):
    if not hasattr(word_tokenize, 'tokenizer'):
        word_tokenize.tokenizer = WordTokenizer()
    return word_tokenize.tokenizer.tokenize(sentence)
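Both wrappers build their tokenizer lazily on first call and cache it as an attribute of the function itself, so repeated calls share one instance. A brief usage sketch (sample text illustrative):

for sent in sent_tokenize('این یک جمله است. این هم جملهٔ دیگری است.'):
    print(word_tokenize(sent))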
Example #16
from nltk.corpus import WordListCorpusReader


class OpinionSentenceFinder:
    def __init__(self, features, feature_sentences):
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index][
                                'opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        self.negation_words = WordListCorpusReader('../data/corpora/',
                                                   'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        pass  # stub: not implemented yet

    """
		Todo: concat consecutive JJ's (Opt.) 
		      Remove meaningless JJ's (95% done.)
		      Implement lemmatizing while checking JJ's
		      Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
		      Negation of opinions. (done.)
		      (Opt.) Append (RR, RB) to the JJ
		      Special treatment for NOUNS in pros
			Fix neg bug
	"""

    def get_nearest_JJ(self, tags, n_index):
        adj = ''
        neg = ''
        sentiment = None
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            end = -1  # nothing found ahead: scan all the way back
            neg = ''
        else:
            end = n_index - (i - n_index) - 1  # scan back no farther than we went forward
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
            if unicode.encode(w) in self.negative_sentiments.words():
                adj = w
                sentiment = False
            if unicode.encode(w) in self.positive_sentiments.words():
                adj = w
                sentiment = True
                break
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)
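A minimal usage sketch, mirroring the collector example above (the word-list files under ../data/corpora/ must exist):

finder = OpinionSentenceFinder(features, feature_sentences)
# each opinion sentence pairs a feature with a (sentiment, negation_word, adjective) triple
print(finder.opinion_sentences)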