def __init__(self, language, sw_files=None, load_default=True):
    """Load stopwords for *language*.

    :param language: language code; used as the word-list fileid.
    :param sw_files: optional iterable of extra stopword filenames,
        resolved under ``BASE_SW_PATH + language``.
    :param load_default: when True, first load the bundled default
        stopword list from ``DEFAULT_SW_FILE``.
    """
    # Fix: the original declared `sw_files=[]` — a mutable default that is
    # shared across every call of this constructor. Use the None sentinel.
    if sw_files is None:
        sw_files = []
    self.language = language
    self.stopwords = []
    if load_default:
        wlcr = WordListCorpusReader(
            data.GzipFileSystemPathPointer(DEFAULT_SW_FILE),
            [language], encoding="utf-8")
        self.stopwords = wlcr.words(language)
        # Lazy %-style args: the message is only formatted when INFO is enabled.
        logging.info("Loaded default stopwords from file %s", DEFAULT_SW_FILE)
    path = BASE_SW_PATH + language
    for sw_file in sw_files:
        wlcr = WordListCorpusReader(
            data.FileSystemPathPointer(path), sw_file, encoding="utf-8")
        self.stopwords += wlcr.words(sw_file)
        logging.info("Loaded stopwords from file '%s'", sw_file)
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    Returns the words with ``+`` escaped so they can be embedded in
    regular-expression patterns.
    """
    # NOTE(review): CWD-relative path — this only works when the process
    # is started from the package directory; the sibling variants resolve
    # against __file__ instead. Confirm which is intended.
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: '\+' is an invalid escape sequence (DeprecationWarning;
    # SyntaxWarning on Python 3.12+). The raw string r'\+' has the
    # identical runtime value, so behavior is unchanged.
    return [w.replace('+', r'\+') for w in tokens.words()]
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    The corpus directory is resolved relative to this module's file, so
    the function works regardless of the current working directory.
    Returns the words with ``+`` escaped for use in regex patterns.
    """
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: raw string replaces the invalid '\+' escape sequence
    # (warns on modern Pythons); the runtime value is identical.
    return [w.replace('+', r'\+') for w in tokens.words()]
def load_token_list(term_file):
    """Load a stopword/token list from the corpus.

    Resolves the corpus directory relative to this module's file and
    returns the words with ``+`` escaped for regex use.
    """
    __location__ = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), _corpus_root)
    tokens = WordListCorpusReader(__location__, term_file)
    # Fix: '\+' is an invalid escape sequence (SyntaxWarning on 3.12+);
    # the raw string is byte-identical at runtime.
    return [w.replace('+', r'\+') for w in tokens.words()]
def extract_mimetypes(text, do_replace=True):
    """Pull a list of mimetypes from some text feature.

    Returns a tuple ``(found, text)``: the mimetypes that occur as
    substrings of *text*, and the text either stripped of mimetypes
    (when *do_replace* is True) or unmodified.
    """
    reader = WordListCorpusReader(_corpus_root, 'mimetypes.txt')
    found = []
    for candidate in reader.words():
        # substring containment, same as the original check
        if candidate in text:
            found.append(candidate)
    if do_replace:
        text = remove_tokens('mimetypes.txt', text)
    return found, text
def read_stopwords(path):
    """Read a stopword list at *path* using NLTK's word-list reader.

    Splits *path* into directory and filename, builds a
    WordListCorpusReader over that directory, and returns the words of
    the single fileid.
    """
    directory, filename = os.path.split(path)
    reader = WordListCorpusReader(directory, [filename])
    return reader.words(filename)
class OpinionSentenceFinder:
    """Pair extracted product features with the nearest opinion adjective.

    NOTE(review): Python 2 code (`xrange`, unbound `unicode.encode`).
    Reformatted from a collapsed source — statement grouping inside
    `get_nearest_JJ` is reconstructed; verify against the original file.
    """

    def __init__(self, features, feature_sentences):
        # features: iterable of tuples; only item [0] (the feature string)
        # is used. feature_sentences: list of dicts read with keys
        # 'nouns', 'noun_phrases', 'tags' — schema inferred from usage
        # below, confirm against the producer.
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                # unpack the feature string from its tuple
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        # substring match on the feature's first word
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index]['opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        # Word lists resolved relative to the CWD — fragile; confirm the
        # process is launched from the expected directory.
        self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        # Stub: bare `None` expression, a no-op placeholder.
        None

    """
    Todo:
        concat consecutive JJ's (Opt.)
        Remove meaningless JJ's (95% done.)
        Implement lemmatizing while checking JJ's
        Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
        Negation of opinions. (done.) (Opt.)
        Append (RR, RB) to the JJ
        Special treatment for NOUNS in pros
        Fix neg bug
    """

    def get_nearest_JJ(self, tags, n_index):
        """Scan outward from ``tags[n_index]`` for the nearest adjective.

        Returns ``(sentiment, neg, adj)``: sentiment True/False/None,
        the negation word seen (or ''), and the adjective (or '').
        """
        adj = ''
        neg = ''
        sentiment = None
        # Forward pass: stop at a sentence end; remember any negation
        # word; break at the first JJ/JJR/JJS found.
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                # NOTE(review): unbound `unicode.encode(w)` only works on
                # Python 2 with unicode instances — a py3 port must use
                # w.encode(...) or drop the call.
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            # Nothing found forward: scan the whole left context.
            end = -1
            neg = ''
        else:
            # Forward hit at distance (i - n_index): only accept a
            # strictly closer adjective on the left.
            end = n_index - (i - n_index) - 1
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        # NOTE(review): when no sentiment word was found, sentiment is
        # None and `not None` yields True — likely unintended; also
        # single-character negations are skipped by the > 1 check. Confirm.
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)
class OpinionSentenceCollector:
    """Collect sentences that voice an opinion about extracted features.

    NOTE(review): Python 2 code (`xrange`, py2-style print in a trailing
    comment). Reformatted from a collapsed source — the nesting inside
    `calculate_sent_score` is reconstructed; verify against the original.
    """

    def __init__(self, features, feature_sentences):
        # feature_sentences: list of dicts read with keys 'nouns',
        # 'noun_phrases', 'tags', 'sentence' — schema inferred from the
        # usage below; confirm against the producer.
        self.features = features
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.opinion_features = []
        self.init_corpus()
        for sentence_index in xrange(len(self.feature_sentences)):
            sentence = self.feature_sentences[sentence_index]
            self.feature_sentences[sentence_index]['opinion_sentence'] = []
            for feature in self.features:
                # Extracting the feature from (feature, count) tuple
                feature = feature[0]
                if feature in sentence['nouns'] or feature in sentence['noun_phrases']:
                    for tag_index in xrange(len(sentence['tags'])):
                        (word, tag) = sentence['tags'][tag_index]
                        # substring match on the feature's first word
                        if(word.find(feature.split()[0])) > -1:
                            (sentiment_score, opinion) = self.calculate_sent_score(sentence['tags'], tag_index)
                            if len(opinion) > 0:
                                self.opinion_features.append(feature)
                                self.opinion_sentences.append((feature, sentiment_score, sentence['sentence']))

    def init_corpus(self):
        # CWD-relative corpus paths — confirm the expected working directory.
        self.negation_words = WordListCorpusReader('../data/corpus/', 'negation-words.txt')
        self.negative_sentiments = WordListCorpusReader('../data/corpus/', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader('../data/corpus/', 'positive-words.txt')

    def calculate_sent_score(self, tags, tag_index):
        """Score the sentiment around ``tags[tag_index]``.

        Returns ``(positive_hits - negative_hits, last adjective seen)``.
        A previously seen negation word flips a hit's contribution.
        """
        positive_sentiment_score = 0
        negative_sentiment_score = 0
        adjective = ''
        negation_words = ''
        # Forward pass over the remainder of the sentence.
        for i in xrange(tag_index + 1, len(tags)):
            (word, tag) = tags[i]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                if word in self.positive_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1
        # Backward pass over the left context; negation state resets.
        start = 0
        negation_words = ''
        for j in xrange(start, tag_index):
            (word, tag) = tags[j]
            if word in self.negation_words.words():
                negation_words = word
            if tag in ['JJ', 'JJR', 'JJS']:
                adjective = word
                if word in self.negative_sentiments.words():
                    adjective = word
                    if not len(negation_words) > 0:
                        negative_sentiment_score += 1
                    else:
                        positive_sentiment_score += 1
                # NOTE(review): unlike the forward pass, this positive
                # branch does not set `adjective` — possibly intentional
                # asymmetry, possibly a bug; confirm before changing.
                if word in self.positive_sentiments.words():
                    if not len(negation_words) > 0:
                        positive_sentiment_score += 1
                    else:
                        negative_sentiment_score += 1
        final_score = positive_sentiment_score - negative_sentiment_score
        #print "Sentiment Score", final_score, adjective
        return final_score, adjective
class OpinionSentenceFinder:
    """Attach the nearest opinion adjective to each extracted feature.

    NOTE(review): duplicate of an earlier class in this file with the
    same name; whichever definition executes last wins at import time.
    Python 2 code (`xrange`, unbound `unicode.encode`); reformatted from
    a collapsed source, so statement grouping is reconstructed — verify.
    """

    def __init__(self, features, feature_sentences):
        # features: tuples whose first item is the feature string.
        # feature_sentences: dicts read with keys 'nouns',
        # 'noun_phrases', 'tags' (inferred from usage; confirm).
        self.feature_sentences = feature_sentences
        self.opinion_sentences = []
        self.features = features
        self.__init_corpora()
        for sent_index in xrange(len(self.feature_sentences)):
            sent = self.feature_sentences[sent_index]
            self.feature_sentences[sent_index]['opinion_sent'] = []
            for feature in self.features:
                # keep only the feature string from its tuple
                feature = feature[0]
                if feature in sent['nouns'] or feature in sent['noun_phrases']:
                    for index in xrange(len(sent['tags'])):
                        (w, t) = sent['tags'][index]
                        # substring match against the feature's first word
                        if w.find(feature.split()[0]) > -1:
                            JJ = self.get_nearest_JJ(sent['tags'], index)
                            self.feature_sentences[sent_index][
                                'opinion_sent'].append((feature, JJ))
                            self.opinion_sentences.append((feature, JJ))

    def __init_corpora(self):
        # CWD-relative word lists; confirm the launch directory.
        self.negation_words = WordListCorpusReader('../data/corpora/', 'negation_words')
        self.sent_ends = WordListCorpusReader('../data/corpora', 'sent_ends')
        self.negative_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'negative-words.txt')
        self.positive_sentiments = WordListCorpusReader(
            '../data/corpora/sentiment-lexicon', 'positive-words.txt')

    def remove_uncertain_features(self):
        # Placeholder stub: the bare `None` expression is a no-op.
        None

    """
    Todo:
        concat consecutive JJ's (Opt.)
        Remove meaningless JJ's (95% done.)
        Implement lemmatizing while checking JJ's
        Stop scanning for JJ's, after the period or ',' or other sentence ends (done.)
        Negation of opinions. (done.) (Opt.)
        Append (RR, RB) to the JJ
        Special treatment for NOUNS in pros
        Fix neg bug
    """

    def get_nearest_JJ(self, tags, n_index):
        """Find the adjective nearest to ``tags[n_index]``.

        Returns ``(sentiment, neg, adj)`` — polarity (True/False/None),
        the negation word encountered ('' if none), and the adjective.
        """
        adj = ''
        neg = ''
        sentiment = None
        # Scan rightward: halt at sentence ends, note negations, break
        # on the first adjective tag.
        for i in xrange(n_index + 1, len(tags)):
            (w, t) = tags[i]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                # NOTE(review): `unicode.encode(w)` is a Python-2-only
                # unbound-method call; porting requires w.encode(...).
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        start = n_index
        if len(adj) < 1:
            # No forward hit: search the entire left context.
            end = -1
            neg = ''
        else:
            # Forward hit at distance (i - n_index): only a strictly
            # closer left-side adjective may replace it.
            end = n_index - (i - n_index) - 1
        for j in xrange(start, end, -1):
            (w, t) = tags[j]
            if w in self.sent_ends.words():
                break
            if w in self.negation_words.words():
                neg = w
            if t in ['JJ', 'JJR', 'JJS']:
                adj = w
                if unicode.encode(w) in self.negative_sentiments.words():
                    adj = w
                    sentiment = False
                if unicode.encode(w) in self.positive_sentiments.words():
                    adj = w
                    sentiment = True
                break
        # NOTE(review): with no sentiment word found, `not None` is True,
        # and the > 1 check skips one-character negations — confirm both.
        if len(neg) > 1:
            sentiment = not sentiment
        return (sentiment, neg, adj)