Пример #1
0
def performTask(rawtext):
    """Run the skill-extraction pipeline on ``rawtext``.

    The preprocessed text is tagged with ``pt.tag`` (the tagged result is
    printed), then lower-cased and passed to ``generate_naive`` together with
    the dictionary built from the global ``skills``. ``new_guesses`` then
    derives the expanded skills from the tagged text, the naive skills and
    the global ``words`` / ``clusters``.

    Returns:
        A ``(naive_skills, expanded_skills)`` tuple.
    """
    cleaned = textprocess.preprocess(rawtext)

    # Tag before lower-casing: the Stanford tagger might make use of the
    # capitalization.
    tagged = pt.tag(cleaned)
    print(tagged)

    lowered = cleaned.lower()
    naive_skills = generate_naive(lowered, buildskilldict(skills))
    expanded_skills = new_guesses(tagged, naive_skills, words, clusters)
    return naive_skills, expanded_skills
Пример #2
0
def crackr(text, skillfilter=None):
    """Extract scored keyword phrases from ``text``.

    The text is preprocessed and lower-cased, split into sentences and
    candidate phrases, and every candidate phrase is scored with
    ``calculateWordScores`` / ``generateCandidateKeywordScores``.

    Args:
        text: raw input text.
        skillfilter: one of
            - None: don't filter
            - "pre": filter a priori (only tokens found in ``skilldict`` are
              used to build the n-grams)
            - "post": filter a posteriori (only scored phrases found in
              ``skilldict`` are returned)

    Returns:
        A list of ``{'keyword': phrase, 'weight': score}`` dicts, sorted by
        descending score.
    """
    # preprocess text
    text = textprocess.preprocess(text).lower()

    # tokenize
    sentenceList = splitSentences(text)
    phraseList = generateCandidateKeywords(sentenceList, stopwordpattern)
    wordscores = calculateWordScores(phraseList)

    # generate ngrams of length 1 to 3
    tokens = text.split()

    # pre-filter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    ngrams = [ngram for n in range(1, 4) for ngram in generate_ngrams(tokens, n)]

    # filter clusters: collect the cluster of every ngram known to `words`
    viableclusters = set()
    for ngram in ngrams:
        try:
            cluster = words[ngram][1]
        except (KeyError, IndexError, TypeError):
            # Best effort: the ngram is unknown (or its entry has no cluster
            # field), so it simply contributes no cluster.
            continue
        viableclusters.add(cluster)

    # filter words by clusters
    viablewords = set()
    for cluster in viableclusters:
        viablewords.update(word for (word, _) in clusters[cluster])

    # pos tag
    pos_tagged = pt.tag(text)

    # filter single words: keep tokens tagged N*/J*, and a CC that directly
    # follows an N* token, provided the word belongs to a viable cluster
    finallst = []
    indices = []
    # Track the previous tag explicitly. Indexing pos_tagged[index - 1] would
    # wrap around to the LAST token when index == 0.
    prev_tag = ''
    for index, (word, tag) in enumerate(pos_tagged):
        is_candidate = (
            tag.startswith(('N', 'J'))
            or (tag == 'CC' and prev_tag.startswith('N'))
        )
        if is_candidate and word.lower() in viablewords:
            finallst.append(word)
            indices.append(index)
        prev_tag = tag

    # generate keyword phrases
    # NOTE(review): the stitched n-grams below are never used to build the
    # returned list, so the cluster/POS filtering above (and therefore the
    # "pre" filter) currently has no effect on the result. Confirm whether
    # they were meant to restrict `keywordcandidates`; kept as is because the
    # intended use is not visible here.
    ngrams = stich(finallst, indices)
    ngrams = [" ".join(ngram) for ngram in ngrams]
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    scored_ngrams = sorted(keywordcandidates.items(),
                           key=operator.itemgetter(1), reverse=True)

    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                         if ngram in skilldict]

    # format
    return [{'keyword': ngram, 'weight': score} for (ngram, score) in scored_ngrams]