import nltk
from nltk.stem import WordNetLemmatizer

# Assumed module-level lemmatizer; the original references `lemma_obj` without
# showing its definition. RakeKeywordExtractor, getKeywordFrequency, and
# getAdjacencyFrequency are defined elsewhere in this project.
lemma_obj = WordNetLemmatizer()


def completeRake(content):
    # First pass: extract candidate keywords with an empty stopword list.
    rakeObject = RakeKeywordExtractor(set())
    keywordList = rakeObject.extract(content, True)

    # Lemmatize the document so frequency counts are not split across inflections.
    words = nltk.word_tokenize(content)
    words = [lemma_obj.lemmatize(word) for word in words]
    content = ' '.join(words)

    freq_dist = nltk.FreqDist(words)
    keyword_freq = getKeywordFrequency(keywordList)
    adjacency_freq = getAdjacencyFrequency(keywordList, content, words)
    sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)

    # A word that occurs next to keywords more often than inside them is
    # treated as an additional stopword.
    additional_stopwords = []
    for word, _count in sortedFreqList:
        keyword_freq.setdefault(word, 0)
        adjacency_freq.setdefault(word, 0)
        if adjacency_freq[word] > keyword_freq[word]:
            additional_stopwords.append(word)

    # Second pass: re-extract keywords using the generated stopword list.
    newRakeObject = RakeKeywordExtractor(set(additional_stopwords))
    newKeywordList = newRakeObject.extract(content)
    for keywords in newKeywordList:
        print(keywords)
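# getAdjacencyFrequency() is called above but not defined in this fragment.
# The sketch below is an assumption consistent with the adjacency-based
# stoplist generation described in the RAKE paper (Rose et al., 2010): count
# how often each token borders an occurrence of an extracted keyword phrase.
def getAdjacencyFrequency(keywordList, content, words):
    # `content` matches the call site but is unused in this sketch; the token
    # list `words` is scanned directly. The original may also restrict the
    # scan to the top NUM phrases.
    adjacency_freq = {}
    for phrase, _score in keywordList:
        phrase_tokens = phrase.split(' ')
        n = len(phrase_tokens)
        for i in range(len(words) - n + 1):
            if words[i:i + n] == phrase_tokens:
                if i > 0:
                    prev = words[i - 1]
                    adjacency_freq[prev] = adjacency_freq.get(prev, 0) + 1
                if i + n < len(words):
                    nxt = words[i + n]
                    adjacency_freq[nxt] = adjacency_freq.get(nxt, 0) + 1
    return adjacency_freq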
import os

import matplotlib.pyplot as plt
from wordcloud import WordCloud


def show_wordcloud(text):
    # Render a word cloud for the given text and wait for a keypress.
    wordcloud = WordCloud().generate(text)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
    input(">")


def main():
    # Set N as the number of keywords to evaluate from the keyword lists.
    N = 243
    base_dir = os.path.dirname(os.path.realpath(__file__))
    stopwords_simple = os.path.join(base_dir, 'stopwords_nltk.txt')
    rake = RakeKeywordExtractor(stopwords_simple)

    tp_total = 0
    fp_total = 0
    fn_total = 0

    corpus = os.path.join(base_dir, 'corpus')
    txtfiles = [f for f in os.listdir(corpus) if f.endswith('.txt')]
    for txtfile in txtfiles:
        try:
            # Each .txt document has a matching .key file of gold keywords.
            keyfile = os.path.join(corpus, txtfile).replace('.txt', '.key')
            with open(os.path.join(corpus, txtfile), encoding='utf-8') as f:
                content = f.read()

            keywordsExtracted = set(rake.extract(content, incl_scores=False)[:N])
            keywordsExpected = set(listfromfilelines(keyfile)[:N])
            tp, fp, fn = confusionMatrix(keywordsExtracted, keywordsExpected)
            p, r, f1 = getF1(tp, fp, fn)
            tp_total += tp
            fp_total += fp
            fn_total += fn
            print("F1 for top " + str(N) + " keywords in " + txtfile + ":\t" + str(f1))

            # Visual comparison: extracted keywords, gold keywords, full text.
            show_wordcloud(' '.join(keywordsExtracted))
            show_wordcloud(' '.join(keywordsExpected))
            show_wordcloud(content)
        except Exception as err:
            print(Exception, err)

    # COMMENT NEXT LINES IN FOR DEBUGGING
    # print("Extracted Keywords:")
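# listfromfilelines(), confusionMatrix(), and getF1() are called above but not
# defined in this fragment. The sketches below are assumptions consistent with
# their call sites: read gold keywords one per line, count set overlaps, and
# compute standard precision/recall/F1.
def listfromfilelines(path):
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def confusionMatrix(extracted, expected):
    tp = len(extracted & expected)   # keywords found in both sets
    fp = len(extracted - expected)   # extracted but not expected
    fn = len(expected - extracted)   # expected but missed
    return tp, fp, fn


def getF1(tp, fp, fn):
    # Precision, recall, and F1 with guards against empty denominators.
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1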
import codecs
import re
import sys


def getKeywordFrequency(keywordList):
    # Count how often each individual word occurs inside the top NUM extracted
    # keyword phrases. NUM is defined elsewhere in the original script.
    keyword_freq = {}
    for i in range(0, NUM):
        keywords = keywordList[i][0].split(' ')
        for word in keywords:
            keyword_freq.setdefault(word, 0)
            keyword_freq[word] = keyword_freq[word] + 1
    return keyword_freq


# f = open(sys.argv[1], 'r')
f = codecs.open(sys.argv[1], 'r', "iso8859-15")
content = f.read()
content = content.encode('ascii', 'ignore').decode('ascii')
# Strip numbers before extraction.
content = re.sub(r"[1-9][0-9]*\.?[0-9]*", '', content)

# Run the two-pass extraction (first pass, adaptive stopwords, second pass).
completeRake(content)
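# Run sketch: the script takes the input document's path as its first
# command-line argument (the script filename below is illustrative):
#
#     python rake_adaptive_stopwords.py document.txt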
# The original fragment begins mid-query; the head below is an assumption
# consistent with the attribute accesses (pet.Temas, pet.Portal) further down.
peticion_list = session.query(Peticion, Portal, Temas, Dependencia).join(
    Portal, Peticion.folioPeticion == Portal.folioSAC).join(
    Temas, Peticion.tema_id == Temas.temaId).join(
    Dependencia, Peticion.dependencia_id == Dependencia.dependenciaId)

kw = {}
temas = {}
rake = RakeKeywordExtractor()  # hoisted out of the loop; one extractor suffices
for pet in peticion_list:
    # Tally how many attended petitions fall under each topic (tema).
    tema_pet = pet.Temas.nomTema
    temas[tema_pet] = temas.get(tema_pet, 0) + 1

    # Tally keyword occurrences across all petition descriptions.
    kw_tmp = rake.extract(pet.Portal.descripcion)
    for word in kw_tmp:
        kw[word] = kw.get(word, 0) + 1

# Emit one JSON-style line per topic, most-attended first. `tema_keyword`
# (topic -> keyword string) is assumed to be built elsewhere in the script.
mylist = sorted(temas.items(), key=lambda x: x[1], reverse=True)
for item in mylist:
    print('"' + item[0] + '": {"peticiones_atendidas": ' + str(item[1]) +
          ', "palabras_clave": "' + tema_keyword[item[0]] + '"},')
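# `tema_keyword` is read in the print loop above but never built in this
# fragment. One plausible construction (an assumption, not the original code)
# collects keyword counts per tema and joins the most frequent ones into a
# display string:
tema_kw_counts = {}  # tema -> {keyword -> count}; hypothetical helper structure
for pet in peticion_list:
    counts = tema_kw_counts.setdefault(pet.Temas.nomTema, {})
    for word in rake.extract(pet.Portal.descripcion):
        counts[word] = counts.get(word, 0) + 1

tema_keyword = {
    tema: ', '.join(w for w, _ in
                    sorted(counts.items(), key=lambda x: x[1], reverse=True)[:5])
    for tema, counts in tema_kw_counts.items()
}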
import re

import MySQLdb
from pandas.io import sql

# `queryAnswers`, `queryTag`, and `queryInsertTag`, along with `rake`,
# `working_dir`, `question_id`, `max_tags`, `DBConfig`, and
# `must_delete_former_automatic_tags`, are defined earlier in the original script.
querySelectAnswerTag = ("SELECT ID FROM SurveyAnswer_Tags "
                        "WHERE SurveyAnswerID = %d AND SurveyAnswerTagID = %d")
queryDeleteFormerTags = ("DELETE FROM SurveyAnswer_Tags "
                         "WHERE SurveyAnswerID = %d AND SurveyAnswerTagID IN "
                         "(SELECT ID FROM SurveyAnswerTag WHERE Type = 'AUTOMATIC')")

config = DBConfig(working_dir + "/db.ini").read_db_config()
cursor = None
try:
    # Open the database connection and prepare a cursor.
    db = MySQLdb.connect(**config)
    cursor = db.cursor()

    data = sql.read_sql(queryAnswers % question_id, db)
    for idx, answer in enumerate(data['Value'].tolist()):
        keywords = rake.extract(answer)
        answerId = data['ID'].values[idx]
        if must_delete_former_automatic_tags > 0:
            cursor.execute(queryDeleteFormerTags % answerId)

        # Tag processing: keep at most max_tags keywords per answer, strip
        # non-alphanumeric characters, then look each tag up (creating it
        # when missing).
        for idx2, tag in enumerate(keywords[:max_tags]):
            tag = re.sub(r"[^\w\d]", "", tag)
            if len(tag) > 2:
                cursor.execute(queryTag % tag)
                dbTag = cursor.fetchone()
                if dbTag is None:
                    cursor.execute(queryInsertTag % tag)
                    tagId = cursor.lastrowid
                else:
                    tagId = dbTag[0]
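                # `querySelectAnswerTag` is defined above but unused in the
                # fragment, which suggests the remaining step links each tag
                # to its answer. A continuation sketch (an assumption, not the
                # original code; `queryInsertAnswerTag` is a hypothetical
                # statement):
                cursor.execute(querySelectAnswerTag % (answerId, tagId))
                if cursor.fetchone() is None:
                    queryInsertAnswerTag = ("INSERT INTO SurveyAnswer_Tags "
                                            "(SurveyAnswerID, SurveyAnswerTagID) "
                                            "VALUES (%d, %d)")
                    cursor.execute(queryInsertAnswerTag % (answerId, tagId))
    db.commit()
except MySQLdb.Error as err:
    print(err)
finally:
    # `cursor` is initialised to None above, so cleanup is safe even when
    # connecting fails.
    if cursor is not None:
        cursor.close()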