Пример #1
0
def extractNP(CONTENT):
	"""Extract noun-phrase word lists from CONTENT.

	Splits CONTENT into paragraphs and sentences, POS-tags each
	sentence, chunks it with an NBAR/NP regexp grammar, and collects
	the lemmatized, non-stopword words of every NP chunk.

	Returns a list of word lists, one list per non-empty NP chunk.
	"""
	stopwords = getList()
	grammar = r"""
		NBAR:
			{<NN.*|JJ>*<NN.*>}  
		NP:
			{<NBAR>}
			{<NBAR><IN><NBAR>} 

		"""
	chunker = nltk.RegexpParser(grammar)  # chunker driven by the grammar above
	lemmatizer = WordNetLemmatizer()
	words = []
	for paragraph in [p for p in CONTENT.split('\n') if p]:
		for sentence in [s for s in nltk.sent_tokenize(paragraph) if s]:
			tokens = [t.lower() for t in nltk.word_tokenize(sentence)]
			tree = chunker.parse(nltk.pos_tag(tokens))
			for subtree in tree.subtrees():
				if subtree.label() != "NP":
					continue
				# lowercase -> stopword filter -> lemmatize, as in the
				# original pipeline order
				phrase = [lemmatizer.lemmatize(leaf[0].lower())
				          for leaf in subtree.leaves()
				          if leaf[0].lower() not in stopwords]
				if phrase:
					words.append(phrase)
	return words
Пример #2
0
 def __init__(self,additional_stopwords):
   """Initialize extractor state.

   additional_stopwords -- set of extra stopwords merged (set union)
   into the project stopword list from stopwordList.getList().
   """
   #self.stopwords = set(nltk.corpus.stopwords.words())
   self.stopwords = set(getList())
   self.stopwords = self.stopwords | additional_stopwords	
   self.top_fraction = 4 # divisor: consider top quarter of candidate keywords by score — presumably used as len(candidates)/top_fraction; confirm against callers
Пример #3
0
  def __init__(self):
	"""Initialize extractor with the project stopword list (no extras)."""
	# self.stopwords = set(nltk.corpus.stopwords.words())
	self.stopwords  = getList()  
	self.top_fraction = 2 # divisor: consider top half of candidate keywords by score — presumably used as len(candidates)/top_fraction; confirm against callers
Пример #4
0
import operator
import sys
from textblob import TextBlob
from rake import RakeKeywordExtractor
from textblob.np_extractors import ConllExtractor
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from textblob.taggers import NLTKTagger
from stopwordList import getList
import codecs
## GLOBAL VARIABLES 
top_fraction = 1  # divisor for a top-N keyword cut — presumably keeps all candidates when 1; confirm against users of this module
LEMMA_OBJ = WordNetLemmatizer()  # shared WordNet lemmatizer instance
tokenizer = WordPunctTokenizer()  # splits on word/punctuation boundaries
nltk_tagger = NLTKTagger()  # POS tagger used by TextBlob
stopwords = getList()  # project stopword list from stopwordList.getList()
COLL_OBJ = ConllExtractor()	

def rake_extract(phrase_list):
	"""Score candidate phrases with RAKE and return them sorted.

	phrase_list: candidate phrase list as produced by the RAKE pipeline
	(the format RakeKeywordExtractor's scoring helpers expect).

	Returns a list of (phrase, score) tuples in descending score order.
	"""
	rake = RakeKeywordExtractor(set())
	word_scores = rake._calculate_word_scores(phrase_list)
	phrase_scores = rake._calculate_phrase_scores(phrase_list, word_scores)
	# dict.items() instead of the Python-2-only iteritems() so this runs
	# on both Python 2 and 3.  The original [0:len(...)] slice was a
	# no-op (it returned the whole list), so we return the sorted list
	# directly — same result.
	return sorted(phrase_scores.items(),
	              key=operator.itemgetter(1), reverse=True)

	

# Read the whole input document: path is the first CLI argument,
# decoded as ISO-8859-15 (Latin-9).
# NOTE(review): FILE is never closed here; consider a with-block.
#FILE = open(sys.argv[1],"r")	
FILE = codecs.open(sys.argv[1],"r","iso8859-15")	
CONTENT = FILE.read()