def extract_keywords(text, stopwords_pattern):
    """Run RAKE over *text* and keep the top-scoring third of keywords.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py with
    very minor modifications) on the full description to create less
    noisy text for vectorization.  The surviving keywords are joined
    into a single space-separated string.
    """
    sentences = rk.splitSentences(text)
    candidate_phrases = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    scores = rk.calculateWordScores(candidate_phrases)
    candidates = rk.generateCandidateKeywordScores(candidate_phrases, scores)
    # rank every candidate by its RAKE score, best first
    ranked = sorted(candidates.iteritems(),
                    key=operator.itemgetter(1), reverse=True)
    # keep only the top third of the ranking
    cutoff = int(len(ranked) / 3)
    return " ".join(keyword for keyword, _ in ranked[:cutoff])
def rake(text, skillfilter=None):
    """Score keywords in *text* with RAKE, optionally keeping known skills only.

    Parameters
    ----------
    text : str
        Raw document text; preprocessed before tokenization.
    skillfilter : optional
        When not None, only n-grams present in the module-level
        ``skilldict`` survive (applied after scoring).

    Returns
    -------
    list of dict
        ``[{'keyword': ngram, 'weight': score}, ...]`` sorted by
        descending score.
    """
    # preprocess text
    text = textprocess.preprocess(text)
    # tokenize into candidate phrases
    sentenceList = splitSentences(text)
    phraseList = generateCandidateKeywords(sentenceList, stopwordpattern)
    # generate candidates and calculate scores
    wordscores = calculateWordScores(phraseList)
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    scored_ngrams = sorted(keywordcandidates.iteritems(),
                           key=operator.itemgetter(1), reverse=True)
    # post-scoring filter: keep only n-grams that are known skills
    # (was `!= None`; identity comparison is the correct idiom for None)
    if skillfilter is not None:
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]
    # format
    return [{'keyword': pair[0], 'weight': pair[1]} for pair in scored_ngrams]
def extract(text):
    """Extract keywords from *text*.

    Preprocesses and tokenizes the text, builds 1-3 word n-grams, scores
    them with RAKE, down-weights each phrase by its IDF, post-filters,
    and normalizes the surviving weights so the strongest keyword has
    weight 1.

    Returns a list of ``{'keyword': ..., 'weight': ...}`` dicts sorted by
    descending weight; keywords are emitted in their normalized casing.
    """
    # preprocess, tokenize, group in n-grams
    text = util.textprocess.preprocess(text)
    sentences = [separatewords(sentence, 0) for sentence in splitSentences(text)]
    phrases = [ngram
               for sentence in sentences
               for n in range(3)
               for ngram in generate_ngrams(sentence, n + 1)]
    phrases = prefilter(phrases)
    # remember a display casing for each phrase
    casings = {}
    for phrase in phrases:
        casings[' '.join(phrase)] = casing.normalize(phrase)
    # RAKE scoring
    wordscores = calculateWordScores(phrases)
    keywords = generateCandidateKeywordScores(phrases, wordscores)
    # Factor in IDF.  Iterate over a snapshot of the keys: entries are
    # deleted inside the loop, and mutating a live keys view is an error
    # on Python 3 (and fragile on Python 2).
    for keyphrase in list(keywords.keys()):
        idfScore = idf.get(keyphrase)
        # Drop phrases with no usable IDF.  Truthiness also covers a
        # missing entry (None), which previously slipped past `== 0`
        # and crashed the division below.
        if not idfScore:
            del keywords[keyphrase]
            continue
        keywords[keyphrase] /= idfScore
    # Post filter: drop everything the filter rejected (snapshot again,
    # for the same delete-while-iterating reason).
    filtered = postfilter(list(keywords.keys()))
    for keyphrase in list(keywords.keys()):
        if keyphrase not in filtered:
            del keywords[keyphrase]
    # Normalize scores so the strongest surviving keyword has weight 1
    if len(filtered) > 0:
        maxWeight = max([keywords[keyphrase] for keyphrase in filtered])
        if maxWeight > 0:
            for keyphrase in keywords.keys():
                keywords[keyphrase] /= maxWeight
    # format, best first
    keywords = sorted(keywords.iteritems(),
                      key=operator.itemgetter(1), reverse=True)
    return [{"keyword": casings.setdefault(pair[0], pair[0]), "weight": pair[1]}
            for pair in keywords]
def naive(text, skillfilter=None, jointfilter=True):
    """Naive keyword extraction: score every 1-3 token n-gram in *text*.

    ``skillfilter`` may be 'pre' (drop tokens not in ``skilldict`` before
    n-gram generation) or 'post' (drop unknown n-grams after scoring);
    any other value disables filtering.

    Returns ``[{'keyword': ..., 'weight': ...}]`` sorted by descending
    weight.
    """
    # preprocess and score the individual words
    text = textprocess.preprocess(text)
    wordscores = calculateWordScores(text)
    # tokenize
    tokens = text.split()
    # optional a-priori filter against the known-skill dictionary
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    # build and score all 1-, 2- and 3-grams
    phraseList = []
    for size in (1, 2, 3):
        phraseList.extend(generate_ngrams(tokens, size))
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    scored_ngrams = sorted(keywordcandidates.iteritems(),
                           key=operator.itemgetter(1), reverse=True)
    # optional a-posteriori filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]
    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]
def crackr(text, skillfilter=None):
    """Cluster/POS-aware keyword extraction.

    skillfilter can be one of
      - None:   don't filter
      - "pre":  filter a priori
      - "post": filter a posteriori

    Returns ``[{'keyword': ..., 'weight': ...}]`` sorted by descending
    weight.
    """
    # preprocess text
    text = textprocess.preprocess(text).lower()
    # tokenize
    sentenceList = splitSentences(text)
    phraseList = generateCandidateKeywords(sentenceList, stopwordpattern)
    wordscores = calculateWordScores(phraseList)
    # generate ngrams
    tokens = text.split()
    # pre-filter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    ngrams = [ngram for n in range(3) for ngram in generate_ngrams(tokens, n + 1)]
    # collect the clusters any of our ngrams belong to
    viableclusters = []
    for ngram in ngrams:
        try:
            viableclusters += [words[ngram][1]]
        except (KeyError, IndexError):
            # ngram unknown to the cluster vocabulary -- skip it.
            # (Was a bare `except:`; narrowed so real bugs aren't swallowed.)
            pass
    # expand the viable clusters back into a set of viable words
    viablewords = []
    for cluster in set(viableclusters):
        for (word, _) in clusters[cluster]:
            viablewords += [word]
    viablewords = set(viablewords)
    # pos tag
    pos_tagged = pt.tag(text)
    # keep nouns/adjectives (and a conjunction directly after a noun)
    # that are cluster-viable; leftover debug `print` statements for
    # 'CC' tags have been removed.
    index = 0
    finallst = []
    indices = []
    thirdlst = []
    for (word, tag) in pos_tagged:
        if tag[0] == 'N' or tag[0] == 'J' or \
                (tag == 'CC' and pos_tagged[index - 1][1][0] == 'N'):
            if word.lower() in viablewords:
                finallst += [(word)]
                indices += [index]
                thirdlst += [(word, index)]
        index += 1
    # generate keyword phrases
    # NOTE(review): these stitched ngrams are computed but never used
    # below -- scoring runs on phraseList.  Confirm whether the stitched
    # ngrams were meant to be scored instead.
    ngrams = stich(finallst, indices)
    ngrams = [" ".join(ngram) for ngram in ngrams]
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    scored_ngrams = sorted(keywordcandidates.iteritems(),
                           key=operator.itemgetter(1), reverse=True)
    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]
    # format
    return [{'keyword': pair[0], 'weight': pair[1]} for pair in scored_ngrams]