Example #1
def completeRake(content):
    # First pass: extract scored keywords with an empty extra-stopword set.
    rakeObject = RakeKeywordExtractor(set())
    keywordList = rakeObject.extract(content, True)

    # Lemmatize the document so frequency counts match the keyword tokens.
    words = nltk.word_tokenize(content)
    words = [lemma_obj.lemmatize(word) for word in words]
    content = ' '.join(words)

    # Word frequency in the full text vs. frequency inside / adjacent to keywords.
    freq_dist = nltk.FreqDist(words)
    keyword_freq = getKeywordFrequency(keywordList)
    adjacency_freq = getAdjacencyFrequency(keywordList, content, words)

    # Words that occur next to keyword phrases more often than inside them
    # behave like stopwords, so feed them back into RAKE as extra stopwords.
    sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)
    additional_stopwords = []
    for key in sortedFreqList:
        keyword_freq.setdefault(key[0], 0)
        adjacency_freq.setdefault(key[0], 0)
        if adjacency_freq[key[0]] > keyword_freq[key[0]]:
            additional_stopwords.append(key[0])

    # Second pass with the learned stopwords.
    newRakeObject = RakeKeywordExtractor(set(additional_stopwords))
    newKeywordList = newRakeObject.extract(content)
    for keywords in newKeywordList:
        print(keywords)
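
The function above leans on names defined elsewhere in its module: nltk, a lemmatizer bound to lemma_obj, the RakeKeywordExtractor class, and the getKeywordFrequency / getAdjacencyFrequency helpers. A minimal sketch of that surrounding context, assuming lemma_obj is NLTK's WordNetLemmatizer (an assumption, not shown in the original snippets):

# Assumed module-level context for the completeRake examples.
# lemma_obj is presumed to be NLTK's WordNetLemmatizer; RakeKeywordExtractor,
# getKeywordFrequency and getAdjacencyFrequency come from the same project
# (Example #4 below shows part of getKeywordFrequency).
import nltk
from nltk.stem import WordNetLemmatizer

lemma_obj = WordNetLemmatizer()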
Example #2
def completeRake(content):
    rakeObject = RakeKeywordExtractor(set())
    keywordList = rakeObject.extract(content, True)

    words = nltk.word_tokenize(content)
    words = [lemma_obj.lemmatize(word) for word in words]
    content = ' '.join(words)

    freq_dist = nltk.FreqDist(words)
    keyword_freq = getKeywordFrequency(keywordList)
    adjacency_freq = getAdjacencyFrequency(keywordList, content, words)

    sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)
    additional_stopwords = []
    for key in sortedFreqList:
        keyword_freq.setdefault(key[0], 0)
        adjacency_freq.setdefault(key[0], 0)
        if adjacency_freq[key[0]] > keyword_freq[key[0]]:
            additional_stopwords.append(key[0])

    newRakeObject = RakeKeywordExtractor(set(additional_stopwords))
    newKeywordList = newRakeObject.extract(content)
    for keywords in newKeywordList:
        print(keywords)
def main():
    # Number of keywords to evaluate from each keyword list.
    N = 243
    base_dir = os.path.dirname(os.path.realpath(__file__))
    stopwords_simple = os.path.join(base_dir, 'stopwords_nltk.txt')
    rake = RakeKeywordExtractor(stopwords_simple)

    tp_total = 0
    fp_total = 0
    fn_total = 0
    corpus = os.path.join(base_dir, 'corpus')
    txtfiles = [f for f in os.listdir(corpus) if f.endswith('.txt')]

    for txtfile in txtfiles:
        try:
            # Each .txt document has a matching .key file with the expected keywords.
            keyfile = os.path.join(corpus, txtfile).replace('.txt', '.key')
            content = open(os.path.join(corpus, txtfile), 'r', encoding='utf-8').read()

            keywordsExtracted = set(rake.extract(content, incl_scores=False)[0:N])
            keywordsExpected = set(listfromfilelines(keyfile)[0:N])

            tp, fp, fn = confusionMatrix(keywordsExtracted, keywordsExpected)
            p, r, f1 = getF1(tp, fp, fn)

            tp_total += tp
            fp_total += fp
            fn_total += fn

            print("F1 for top " + str(N) + " keywords in " + txtfile + ":\t" + str(f1))

            # Word cloud of the extracted keywords.
            wordcloud = WordCloud().generate(' '.join(keywordsExtracted))
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            input(">")

            # Word cloud of the expected (gold) keywords.
            wordcloud = WordCloud().generate(' '.join(keywordsExpected))
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            input(">")

            # Word cloud of the full document text, for comparison.
            wordcloud = WordCloud().generate(content)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            input(">")

        except Exception as err:
            print(err)

        # COMMENT NEXT LINES IN FOR DEBUGGING
        # print("Extracted Keywords:")
Example #4
    # Tail of getKeywordFrequency: count how often each word appears inside
    # the top-NUM extracted keyword phrases.
    for i in range(0, NUM):
        keywords = keywordList[i][0].split(' ')
        length = len(keywords)
        for word in keywords:
            keyword_freq.setdefault(word, 0)
            keyword_freq[word] = keyword_freq[word] + 1
    return keyword_freq


# f = open(sys.argv[1], 'r')
f = codecs.open(sys.argv[1], 'r', "iso8859-15")
content = f.read()
# Drop non-ASCII characters and numbers before extraction.
content = content.encode('ascii', 'ignore').decode('ascii')
content = re.sub(r"[1-9][0-9]*\.?[0-9]*", '', content)

rakeObject = RakeKeywordExtractor(set())
keywordList = rakeObject.extract(content, True)

words = nltk.word_tokenize(content)
words = [lemma_obj.lemmatize(word) for word in words]
content = ' '.join(words)

freq_dist = nltk.FreqDist(words)
keyword_freq = getKeywordFrequency(keywordList)
adjacency_freq = getAdjacencyFrequency(keywordList, content, words)

sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)
additional_stopwords = []
for key in sortedFreqList:
    keyword_freq.setdefault(key[0], 0)
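
getAdjacencyFrequency is called in Examples #1, #2 and #4 but never defined in any of them. A hypothetical sketch of what it plausibly does, counting how often each word appears immediately before or after an extracted keyword phrase (the signature matches the calls above; the body is an assumption):

# Hypothetical getAdjacencyFrequency; the original is not shown in any example.
# keywordList is assumed to be the (phrase, score) list returned by
# extract(content, True), and words the lemmatized token list.
def getAdjacencyFrequency(keywordList, content, words):
    adjacency_freq = {}
    for phrase, score in keywordList:
        phrase_words = phrase.split(' ')
        n = len(phrase_words)
        for i in range(len(words) - n + 1):
            if words[i:i + n] == phrase_words:
                # Count the token just before and just after the matched phrase.
                if i > 0:
                    adjacency_freq[words[i - 1]] = adjacency_freq.get(words[i - 1], 0) + 1
                if i + n < len(words):
                    adjacency_freq[words[i + n]] = adjacency_freq.get(words[i + n], 0) + 1
    return adjacency_freq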
Example #5
    # Tail of a SQLAlchemy query joining Peticion to Portal, Temas and Dependencia.
    ).join(
        Portal,
        Peticion.folioPeticion == Portal.folioSAC
    ).join(
        Temas, Peticion.tema_id == Temas.temaId
    ).join(
        Dependencia,
        Peticion.dependencia_id == Dependencia.dependenciaId
    )

kw = {}      # keyword -> number of petition descriptions it appears in
temas = {}   # tema (topic) name -> number of petitions

for pet in peticion_list:
    # Count petitions per topic.
    tema_pet = pet.Temas.nomTema
    if tema_pet in temas:
        temas[tema_pet] += 1
    else:
        temas[tema_pet] = 1

    # Run RAKE over the petition description and tally keyword counts.
    rake = RakeKeywordExtractor()
    kw_tmp = rake.extract(pet.Portal.descripcion)
    for word in list(kw_tmp):
        if word not in kw:
            kw[word] = 1
        else:
            kw[word] += 1

mylist = sorted(temas.items(), key=lambda x: x[1], reverse=True)

for item in mylist:
    print('"' + item[0] + '": {"peticiones_atendidas": ' + str(item[1]) +
          ', "palabras_clave": "' + tema_keyword[item[0]] + '"},')
Example #6
querySelectAnswerTag = "SELECT ID FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID = %d"
queryDeleteFormerTags = "DELETE FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID IN (SELECT ID FROM SurveyAnswerTag where Type = 'AUTOMATIC')"

config = DBConfig(working_dir+"/db.ini").read_db_config()

cursor = None

try:
    # Open database connection
    db = MySQLdb.connect(**config)

    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    data = sql.read_sql(queryAnswers % question_id, db)
    for idx, answer in enumerate(data['Value'].tolist()):
        keywords = rake.extract(answer)
        answerId = data['ID'].values[idx]
        if must_delete_former_automatic_tags > 0:
            cursor.execute(queryDeleteFormerTags % (answerId))
        # Tag processing: strip non-word characters from each keyword, then
        # reuse an existing tag row or insert a new one and keep its id.
        for idx2, tag in enumerate(keywords[:max_tags]):
            tag = re.sub(r"[^\w\d]", "", tag)
            if len(tag) > 2:
                cursor.execute(queryTag % (tag))
                dbTag = cursor.fetchone()
                tagId = None
                if dbTag is None:
                    res = cursor.execute(queryInsertTag % (tag))
                    tagId = cursor.lastrowid
                else:
                    tagId = dbTag[0]
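
The statements in this example are built with Python's % string interpolation. MySQLdb's cursor.execute also accepts a separate parameter tuple, which handles quoting and escaping; a sketch of the tag lookup and insert written that way (the column holding the tag text is not shown above, so the Value column name is an assumption):

# Parameterized variant of the tag lookup/insert. The 'Value' column name is
# assumed; only ID and Type appear in the queries shown above.
select_tag = "SELECT ID FROM SurveyAnswerTag WHERE Value = %s AND Type = 'AUTOMATIC'"
insert_tag = "INSERT INTO SurveyAnswerTag (Value, Type) VALUES (%s, 'AUTOMATIC')"

cursor.execute(select_tag, (tag,))
dbTag = cursor.fetchone()
if dbTag is None:
    cursor.execute(insert_tag, (tag,))
    tagId = cursor.lastrowid
else:
    tagId = dbTag[0]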