Пример #1
0
def extract_keywords(text, stopwords_pattern):
  """
  Reduce a description to its highest-scoring RAKE keyword candidates.

  Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
  with very minor modifications) on the full description to
  create less noisy text for vectorization.

  Args:
    text: the full description to extract keywords from.
    stopwords_pattern: stop-word pattern, passed through unchanged to
      ``rk.generateCandidateKeywords``.

  Returns:
    A single string made of the top third (rounded down) of the scored
    candidates, highest score first, joined by single spaces. When fewer
    than three candidates are found the result is an empty string.
  """
  sentences = rk.splitSentences(text)
  phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
  word_scores = rk.calculateWordScores(phrase_list)
  keyword_candidates = rk.generateCandidateKeywordScores(
    phrase_list, word_scores)
  # dict.items() is available on Python 2 and 3; iteritems() is Python 2 only.
  sorted_keywords = sorted(keyword_candidates.items(),
    key=operator.itemgetter(1), reverse=True)
  # Floor division keeps the cut-off an integer on every Python version.
  top_third = sorted_keywords[:len(sorted_keywords) // 3]
  return " ".join(phrase for phrase, _score in top_third)
Пример #2
0
def extract_keywords(text, stopwords_pattern):
    """Condense ``text`` into its best-scoring RAKE keyword candidates.

    Calls the RAKE module (from github.com/aneesha/RAKE/rake.py
    with very minor modifications) on the full description to
    create less noisy text for vectorization.

    Args:
        text: the full description to extract keywords from.
        stopwords_pattern: stop-word pattern, handed unchanged to
            ``rk.generateCandidateKeywords``.

    Returns:
        One space-joined string containing the top third (rounded down)
        of the scored candidates in descending score order; an empty
        string when there are fewer than three candidates.
    """
    sentences = rk.splitSentences(text)
    phrase_list = rk.generateCandidateKeywords(sentences, stopwords_pattern)
    word_scores = rk.calculateWordScores(phrase_list)
    keyword_candidates = rk.generateCandidateKeywordScores(
        phrase_list, word_scores)
    # items() works on Python 2 and 3, whereas iteritems() only exists on 2.
    sorted_keywords = sorted(keyword_candidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    # Integer (floor) division: the slice bound must stay an int.
    keep = len(sorted_keywords) // 3
    return " ".join(phrase for phrase, _score in sorted_keywords[:keep])
Пример #3
0
def rake(text, skillfilter=None):
    """Score the RAKE keyword candidates found in ``text``.

    Args:
        text: raw input text; it is run through ``textprocess.preprocess``
            before being split into sentences.
        skillfilter: ``None`` (the default) keeps every candidate. Any
            other value restricts the result to candidates that are
            present in the module-level ``skilldict``.

    Returns:
        A list of ``{'keyword': candidate, 'weight': score}`` dicts, ordered
        by descending score.
    """
    # preprocess text
    text = textprocess.preprocess(text)

    # tokenize
    sentence_list = splitSentences(text)
    phrase_list = generateCandidateKeywords(sentence_list, stopwordpattern)

    # generate candidates and calculate scores
    word_scores = calculateWordScores(phrase_list)
    keyword_candidates = generateCandidateKeywordScores(phrase_list, word_scores)
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    scored_ngrams = sorted(keyword_candidates.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    # pre/post-filter: identity comparison is the correct test for None
    if skillfilter is not None:
        scored_ngrams = [(ngram, score)
                         for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]
Пример #4
0
def crackr(text, skillfilter=None):
    """Score keyword candidates for ``text``.

    The text is preprocessed and lower-cased, RAKE candidates are scored,
    and in parallel a cluster/part-of-speech based selection of words is
    stitched into phrases.

    Args:
        text: raw input text; it is run through ``textprocess.preprocess``
            and lower-cased before any further processing.
        skillfilter: can be one of
            - ``None``: don't filter
            - ``"pre"``: filter a priori (only tokens present in
              ``skilldict`` are used to build the n-grams)
            - ``"post"``: filter a posteriori (only scored candidates
              present in ``skilldict`` are returned)

    Returns:
        A list of ``{'keyword': candidate, 'weight': score}`` dicts, ordered
        by descending score.
    """
    import logging

    log = logging.getLogger(__name__)

    # preprocess text
    text = textprocess.preprocess(text).lower()

    # tokenize
    sentence_list = splitSentences(text)
    phrase_list = generateCandidateKeywords(sentence_list, stopwordpattern)
    word_scores = calculateWordScores(phrase_list)

    # generate ngrams
    tokens = text.split()

    # pre-filter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    ngrams = [ngram
              for size in range(1, 4)
              for ngram in generate_ngrams(tokens, size)]

    # filter clusters
    viable_clusters = set()
    for ngram in ngrams:
        try:
            cluster = words[ngram][1]
        except (KeyError, IndexError, TypeError):
            # Best effort: n-grams without a usable entry in ``words`` are
            # skipped. Only lookup failures are swallowed, so unrelated
            # errors (and KeyboardInterrupt) are no longer hidden.
            continue
        viable_clusters.add(cluster)

    # filter words by clusters
    viable_words = {word
                    for cluster in viable_clusters
                    for (word, _) in clusters[cluster]}

    # pos tag
    pos_tagged = pt.tag(text)

    # filter single words
    kept_words = []
    kept_indices = []
    # Track the previous tag explicitly: indexing ``pos_tagged[index - 1]``
    # at index 0 would silently wrap around to the last token.
    prev_tag = None
    for index, (word, tag) in enumerate(pos_tagged):
        follows_noun = prev_tag is not None and prev_tag.startswith('N')
        if tag == 'CC':
            # Diagnostic output goes to the logger instead of stdout.
            log.debug("conjunction %r, previous tag %r", word, prev_tag)
        # Keep nouns, adjectives, and conjunctions that directly follow a noun.
        if tag.startswith(('N', 'J')) or (tag == 'CC' and follows_noun):
            if word.lower() in viable_words:
                kept_words.append(word)
                kept_indices.append(index)
        prev_tag = tag

    # generate keyword phrases
    # NOTE(review): the stitched phrases are built but never feed into the
    # returned scores, which come from phrase_list/word_scores only, so the
    # "pre" filter currently cannot change the result. Kept as-is pending
    # confirmation of the intended behavior.
    ngrams = stich(kept_words, kept_indices)
    ngrams = [" ".join(ngram) for ngram in ngrams]
    keyword_candidates = generateCandidateKeywordScores(phrase_list, word_scores)
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    scored_ngrams = sorted(keyword_candidates.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score)
                         for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score}
            for (ngram, score) in scored_ngrams]