def __init__(self, language, sw_files=None, load_default=True):
    """Load stopwords for *language*.

    :param language: language code; used as the word-list fileid.
    :param sw_files: optional iterable of extra stopword filenames,
        resolved under ``BASE_SW_PATH + language``.
    :param load_default: when True, first load the bundled default
        stopword list from ``DEFAULT_SW_FILE``.
    """
    # Fix: the original declared `sw_files=[]` — a mutable default that is
    # shared across every call of this constructor. Use the None sentinel.
    if sw_files is None:
        sw_files = []
    self.language = language
    self.stopwords = []
    if load_default:
        wlcr = WordListCorpusReader(
            data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
            [language], encoding="utf-8")
        self.stopwords = wlcr.words(language)
        # Lazy %-style args: the message is only formatted when INFO is enabled.
        logging.info("Loaded default stopwords from file %s", DEFAULT_SW_FILE)
    path = BASE_SW_PATH + language
    for sw_file in sw_files:
        wlcr = WordListCorpusReader(
            data.FileSystemPathPointer(path), sw_file, encoding="utf-8")
        self.stopwords += wlcr.words(sw_file)
        logging.info("Loaded stopwords from file '%s'", sw_file)
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    Returns the words with ``+`` escaped so they can be embedded in
    regular-expression patterns.
    """
    # NOTE(review): CWD-relative path — this only works when the process
    # is started from the package directory; the sibling variants resolve
    # against __file__ instead. Confirm which is intended.
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: '\+' is an invalid escape sequence (DeprecationWarning;
    # SyntaxWarning on Python 3.12+). The raw string r'\+' has the
    # identical runtime value, so behavior is unchanged.
    return [w.replace('+', r'\+') for w in tokens.words()]
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    The corpus directory is resolved relative to this module's file, so
    the function works regardless of the current working directory.
    Returns the words with ``+`` escaped for use in regex patterns.
    """
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: raw string replaces the invalid '\+' escape sequence
    # (warns on modern Pythons); the runtime value is identical.
    return [w.replace('+', r'\+') for w in tokens.words()]
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    Resolves the corpus directory relative to this module's file and
    returns the words with ``+`` escaped for regex use.
    """
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: '\+' is an invalid escape sequence (SyntaxWarning on 3.12+);
    # the raw string is byte-identical at runtime.
    return [w.replace('+', r'\+') for w in tokens.words()]
def extract_mimetypes(text, do_replace=True):
    """Pull a list of mimetypes from some text feature.

    Returns a tuple ``(found, text)``: the mimetypes that occur as
    substrings of *text*, and the text either stripped of mimetypes
    (when *do_replace* is True) or unmodified.
    """
    reader = WordListCorpusReader(_corpus_root, 'mimetypes.txt')
    found = []
    for candidate in reader.words():
        # substring containment, same as the original check
        if candidate in text:
            found.append(candidate)
    if do_replace:
        text = remove_tokens('mimetypes.txt', text)
    return found, text
def read_stopwords(path):
    """Read a stopword list at *path* using NLTK's word-list reader.

    Splits *path* into directory and filename, builds a
    WordListCorpusReader over that directory, and returns the words of
    the single fileid.
    """
    directory, filename = os.path.split(path)
    reader = WordListCorpusReader(directory, [filename])
    return reader.words(filename)
class OpinionSentenceFinder:
    """Pair extracted product features with the nearest opinion adjective.

    NOTE(review): Python 2 code (`xrange`, unbound `unicode.encode`).
    Reformatted from a collapsed source — statement grouping inside
    `get_nearest_JJ` is reconstructed; verify against the original file.
    """

    def __init__(self, features, feature_sentences):
        # features: iterable of tuples; only item [0] (the feature string)
        # is used. feature_sentences: list of dicts read with keys
        # 'nouns', 'noun_phrases', 'tags' — schema inferred from usage
        # below, confirm against the producer.
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                # unpack the feature string from its tuple
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        # substring match on the feature's first word
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index]['opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        # Word lists resolved relative to the CWD — fragile; confirm the
        # process is launched from the expected directory.
        self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        # Stub: bare `None` expression, a no-op placeholder.
        None

    """
    Todo:
        concat consecutive JJ's (Opt.)
        Remove meaningless JJ's (95% done.)
        Implement lemmatizing while checking JJ's
        Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
        Negation of opinions. (done.) (Opt.)
        Append (RR, RB) to the JJ
        Special treatment for NOUNS in pros
        Fix neg bug
    """

    def get_nearest_JJ(self, tags, n_index):
        """Scan outward from ``tags[n_index]`` for the nearest adjective.

        Returns ``(sentiment, neg, adj)``: sentiment True/False/None,
        the negation word seen (or ''), and the adjective (or '').
        """
        adj = ''
        neg = ''
        sentiment = None
        # Forward pass: stop at a sentence end; remember any negation
        # word; break at the first JJ/JJR/JJS found.
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                # NOTE(review): unbound `unicode.encode(w)` only works on
                # Python 2 with unicode instances — a py3 port must use
                # w.encode(...) or drop the call.
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            # Nothing found forward: scan the whole left context.
            end = -1
            neg = ''
        else:
            # Forward hit at distance (i - n_index): only accept a
            # strictly closer adjective on the left.
            end = n_index - (i - n_index) - 1
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        # NOTE(review): when no sentiment word was found, sentiment is
        # None and `not None` yields True — likely unintended; also
        # single-character negations are skipped by the > 1 check. Confirm.
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)
class OpinionSentenceCollector:
    """Collect sentences that voice an opinion about extracted features.

    NOTE(review): Python 2 code (`xrange`, py2-style print in a trailing
    comment). Reformatted from a collapsed source — the nesting inside
    `calculate_sent_score` is reconstructed; verify against the original.
    """

    def __init__(self, features, feature_sentences):
        # feature_sentences: list of dicts read with keys 'nouns',
        # 'noun_phrases', 'tags', 'sentence' — schema inferred from the
        # usage below; confirm against the producer.
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []
        self.init_corpus()
        for sentence_index in xrange(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                # Extracting the feature from (feature, count) tuple
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in xrange(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        # substring match on the feature's first word
                        if(word.find(feature.split()[0])) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append((feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        # CWD-relative corpus paths — confirm the expected working directory.
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')

    def calculate_sent_score(self, tags, tag_index):
        """Score the sentiment around ``tags[tag_index]``.

        Returns ``(positive_hits - negative_hits, last adjective seen)``.
        A previously seen negation word flips a hit's contribution.
        """
        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''
        # Forward pass over the remainder of the sentence.
        for i in xrange(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1
        # Backward pass over the left context; negation state resets.
        start = 0
        negation_words = ''
        for j in xrange(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                # NOTE(review): unlike the forward pass, this positive
                # branch does not set `adjective` — possibly intentional
                # asymmetry, possibly a bug; confirm before changing.
                if word in self.positive_sentiments.words():
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1
        final_score = positive_sentiment_score - negative_sentiment_score
        #print "Sentiment Score", final_score, adjective
        return final_score, adjective
class OpinionSentenceFinder:
    """Attach the nearest opinion adjective to each extracted feature.

    NOTE(review): duplicate of an earlier class in this file with the
    same name; whichever definition executes last wins at import time.
    Python 2 code (`xrange`, unbound `unicode.encode`); reformatted from
    a collapsed source, so statement grouping is reconstructed — verify.
    """

    def __init__(self, features, feature_sentences):
        # features: tuples whose first item is the feature string.
        # feature_sentences: dicts read with keys 'nouns',
        # 'noun_phrases', 'tags' (inferred from usage; confirm).
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                # keep only the feature string from its tuple
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        # substring match against the feature's first word
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index][
                                'opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        # CWD-relative word lists; confirm the launch directory.
        self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        # Placeholder stub: the bare `None` expression is a no-op.
        None

    """
    Todo:
        concat consecutive JJ's (Opt.)
        Remove meaningless JJ's (95% done.)
        Implement lemmatizing while checking JJ's
        Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
        Negation of opinions. (done.) (Opt.)
        Append (RR, RB) to the JJ
        Special treatment for NOUNS in pros
        Fix neg bug
    """

    def get_nearest_JJ(self, tags, n_index):
        """Find the adjective nearest to ``tags[n_index]``.

        Returns ``(sentiment, neg, adj)`` — polarity (True/False/None),
        the negation word encountered ('' if none), and the adjective.
        """
        adj = ''
        neg = ''
        sentiment = None
        # Scan rightward: halt at sentence ends, note negations, break
        # on the first adjective tag.
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                # NOTE(review): `unicode.encode(w)` is a Python-2-only
                # unbound-method call; porting requires w.encode(...).
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            # No forward hit: search the entire left context.
            end = -1
            neg = ''
        else:
            # Forward hit at distance (i - n_index): only a strictly
            # closer left-side adjective may replace it.
            end = n_index - (i - n_index) - 1
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        # NOTE(review): with no sentiment word found, `not None` is True,
        # and the > 1 check skips one-character negations — confirm both.
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)