def extract_keywords(text, stopwords_pattern):
    """Extract the top third of RAKE-ranked keywords from *text*.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py with very
    minor modifications) on the full description to create less noisy text
    for vectorization.

    Args:
        text: Raw description text to mine for keywords.
        stopwords_pattern: Stopword pattern handed through to RAKE's
            candidate-phrase generator.

    Returns:
        A single space-joined string of the highest-scoring third of the
        RAKE keyword candidates, best-scoring first.
    """
    sentences = rk.splitSentences(text)
    phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrase_list)
    keyword_candidates = rk.generateCandidateKeywordScores(
        phrase_list, word_scores)
    # .items() instead of the Python-2-only .iteritems() so this also runs
    # on Python 3; sorted() materializes the pairs either way.
    sorted_keywords = sorted(keyword_candidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    n_keywords = len(sorted_keywords)
    # Keep only the top third of candidates to cut noise before vectorizing
    # (floor division matches the original int(n / 3) for n >= 0).
    return " ".join(kw for kw, _ in sorted_keywords[:n_keywords // 3])
# NOTE(review): this is a byte-for-byte duplicate of the extract_keywords
# definition immediately above; the later definition shadows the earlier
# one, so one of the two should be removed.
def extract_keywords(text, stopwords_pattern):
    """Extract the top third of RAKE-ranked keywords from *text*.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py with very
    minor modifications) on the full description to create less noisy text
    for vectorization.

    Args:
        text: Raw description text to mine for keywords.
        stopwords_pattern: Stopword pattern handed through to RAKE's
            candidate-phrase generator.

    Returns:
        A single space-joined string of the highest-scoring third of the
        RAKE keyword candidates, best-scoring first.
    """
    sentences = rk.splitSentences(text)
    phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrase_list)
    keyword_candidates = rk.generateCandidateKeywordScores(
        phrase_list, word_scores)
    # .items() instead of the Python-2-only .iteritems() so this also runs
    # on Python 3; sorted() materializes the pairs either way.
    sorted_keywords = sorted(keyword_candidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    n_keywords = len(sorted_keywords)
    # Keep only the top third of candidates to cut noise before vectorizing
    # (floor division matches the original int(n / 3) for n >= 0).
    return " ".join(kw for kw, _ in sorted_keywords[:n_keywords // 3])
def extract(text):
    """Extract RAKE-scored, IDF-weighted keywords from *text*.

    Pipeline: preprocess -> split into sentences and words -> build 1-, 2-
    and 3-grams -> prefilter -> RAKE scoring -> divide each score by its
    IDF -> postfilter -> normalize the surviving scores so the top one is
    1.0 -> return keywords sorted by weight, descending.

    Args:
        text: Raw input text to mine for keywords.

    Returns:
        A list of ``{"keyword": display-cased phrase, "weight": score}``
        dicts, sorted by weight (highest first).
    """
    # Preprocess, tokenize, and group tokens into 1- to 3-grams.
    text = util.textprocess.preprocess(text)
    sentences = [separatewords(sentence, 0)
                 for sentence in splitSentences(text)]
    phrases = [ngram
               for sentence in sentences
               for n in range(1, 4)
               for ngram in generate_ngrams(sentence, n)]
    phrases = prefilter(phrases)

    # Remember the display casing of each phrase before scoring flattens it.
    casings = {}
    for phrase in phrases:
        casings[' '.join(phrase)] = casing.normalize(phrase)

    # RAKE scoring.
    wordscores = calculateWordScores(phrases)
    keywords = generateCandidateKeywordScores(phrases, wordscores)

    # Factor in IDF; drop phrases with a zero IDF score. Iterate over a
    # snapshot of the keys: deleting from a dict while iterating its live
    # key view raises RuntimeError on Python 3.
    # NOTE(review): idf.get() returning None for an unknown phrase would
    # make the division below raise TypeError — confirm idf covers all
    # phrases or returns a nonzero numeric default.
    for keyphrase in list(keywords.keys()):
        idfScore = idf.get(keyphrase)
        if idfScore == 0:
            del keywords[keyphrase]
            continue
        keywords[keyphrase] /= idfScore

    # Post-filter: keep only the phrases the postfilter lets through.
    filtered = postfilter(list(keywords.keys()))
    for keyphrase in list(keywords.keys()):
        if keyphrase not in filtered:
            del keywords[keyphrase]

    # Normalize scores so the best surviving keyword has weight 1.0.
    if len(filtered) > 0:
        maxWeight = max(keywords[keyphrase] for keyphrase in filtered)
        if maxWeight > 0:
            for keyphrase in keywords:
                keywords[keyphrase] /= maxWeight

    # Format: sort by weight and attach the remembered display casing.
    # sorted(keywords.items(), ...) replaces the Python-2-only .iteritems();
    # casings.get() replaces setdefault(), which needlessly mutated casings.
    ranked = sorted(keywords.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    return [{"keyword": casings.get(phrase, phrase), "weight": weight}
            for phrase, weight in ranked]