def extract_keywords(text, stopwords_pattern):
    """
    Reduce ``text`` to its highest-scoring RAKE keywords.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py with very
    minor modifications) on the full description to create less noisy text
    for vectorization.

    Args:
        text: Text to extract keywords from; handed to ``rk.splitSentences``.
        stopwords_pattern: Stop-word pattern, forwarded unchanged to
            ``rk.generateCandidateKeywords``.

    Returns:
        One space-separated string holding the top third (rounded down) of
        the candidate keywords, highest score first. The string is empty
        when fewer than three candidates are found.
    """
    sentences = rk.splitSentences(text)
    phrases = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrases)
    candidates = rk.generateCandidateKeywordScores(phrases, word_scores)

    # items() instead of the Python 2-only iteritems(): works on 2 and 3.
    ranked = sorted(candidates.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    # Floor division keeps the "top third" count an int on every version.
    keep = len(ranked) // 3
    return " ".join(keyword for keyword, _score in ranked[:keep])
def rake(text, skillfilter=None):
    """
    Score candidate keyword phrases in ``text`` with RAKE.

    Args:
        text: Raw input text; it is passed through
            ``textprocess.preprocess`` before tokenizing.
        skillfilter: When not ``None`` (the actual value is not inspected),
            only phrases that are members of ``skilldict`` are kept.

    Returns:
        A list of ``{'keyword': phrase, 'weight': score}`` dicts, ordered
        by descending score.
    """
    # preprocess text
    text = textprocess.preprocess(text)

    # tokenize
    sentences = splitSentences(text)
    phrases = generateCandidateKeywords(sentences, stopwordpattern)

    # generate candidates and calculate scores
    word_scores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, word_scores)
    # items() instead of the Python 2-only iteritems(): works on 2 and 3.
    scored_ngrams = sorted(candidates.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    # pre/post-filter
    if skillfilter is not None:
        scored_ngrams = [(ngram, score)
                         for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]
def crackr(text, skillfilter=None):
    """
    Score keyword phrases in ``text`` with RAKE, and build cluster-based
    phrases from its POS-tagged nouns and adjectives.

    Args:
        text: Raw input text; it is passed through
            ``textprocess.preprocess`` and lower-cased before use.
        skillfilter: One of
            - ``None``: don't filter
            - ``"pre"``: filter a priori (tokens not in ``skilldict`` are
              dropped before n-grams are generated)
            - ``"post"``: filter a posteriori (scored phrases not in
              ``skilldict`` are dropped)

    Returns:
        A list of ``{'keyword': phrase, 'weight': score}`` dicts built from
        the RAKE candidate scores, ordered by descending score.

    NOTE(review): the stitched cluster/POS phrases are computed but never
    feed into the returned list, so ``"pre"`` currently has no effect on
    the result. Confirm whether they were meant to be scored or returned.
    """
    # preprocess text
    text = textprocess.preprocess(text).lower()

    # tokenize
    sentences = splitSentences(text)
    phrases = generateCandidateKeywords(sentences, stopwordpattern)
    word_scores = calculateWordScores(phrases)

    # generate ngrams (uni-, bi- and tri-grams)
    tokens = text.split()
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    ngrams = [ngram
              for size in (1, 2, 3)
              for ngram in generate_ngrams(tokens, size)]

    # filter clusters: collect the cluster of every n-gram known to `words`
    viable_clusters = []
    for ngram in ngrams:
        try:
            cluster = words[ngram][1]
        except (LookupError, TypeError):
            # Unknown n-grams are skipped on purpose. Catching only lookup
            # failures (rather than everything) lets KeyboardInterrupt and
            # genuine programming errors propagate.
            continue
        viable_clusters.append(cluster)

    # filter words by clusters: every word belonging to a viable cluster
    viable_words = {member
                    for cluster in set(viable_clusters)
                    for (member, _) in clusters[cluster]}

    # pos tag
    pos_tagged = pt.tag(text)

    # filter single words: keep nouns (N*), adjectives (J*) and conjunctions
    # (CC) that directly follow a noun, provided the word is viable
    kept_words = []
    kept_indices = []
    # Tracking the previous tag explicitly avoids pos_tagged[-1] being
    # consulted for the first token (negative index wrap-around).
    prev_tag = ''
    for index, (word, tag) in enumerate(pos_tagged):
        # Slicing instead of tag[0] tolerates an empty tag string.
        is_noun_or_adjective = tag[:1] in ('N', 'J')
        joins_noun = tag == 'CC' and prev_tag[:1] == 'N'
        if (is_noun_or_adjective or joins_noun) and word.lower() in viable_words:
            kept_words.append(word)
            kept_indices.append(index)
        prev_tag = tag

    # generate keyword phrases
    # NOTE(review): this result is not used below; see the docstring.
    stitched = stich(kept_words, kept_indices)
    stitched = [" ".join(phrase) for phrase in stitched]

    candidates = generateCandidateKeywordScores(phrases, word_scores)
    # items() instead of the Python 2-only iteritems(): works on 2 and 3.
    scored_ngrams = sorted(candidates.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score)
                         for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]