def giveKeyword(text):
    """Return the top RAKE key phrases of an HTML fragment.

    The markup is stripped with BeautifulSoup, the remaining text is split
    into word-character tokens and re-joined with single spaces, and the
    result is handed to RAKE (stop list: ``SmartStoplist.txt``).

    Returns a string holding at most the first two phrases RAKE reports,
    each followed by a comma (for example ``"first phrase,second,"``), or an
    empty string when RAKE reports nothing.
    """
    from bs4 import BeautifulSoup
    from nltk.tokenize import RegexpTokenizer

    plain = BeautifulSoup(text, "lxml").get_text()
    tokens = RegexpTokenizer(r'\w+').tokenize(plain)

    # Tokens that start with an apostrophe or are a punctuation character
    # are glued to the previous token; everything else gets a leading space.
    pieces = []
    for token in tokens:
        if token.startswith("'") or token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(" " + token)
    cleanText = "".join(pieces).strip()

    ranked = Rake("SmartStoplist.txt").run(cleanText)

    # Each entry's first element is the phrase; keep the first two entries.
    return "".join(entry[0] + "," for entry in ranked[0:2])
def rake_extract_concepts(text, exclude, number,
                          stoplist_path='/usr/share/postgresql/10/tsearch_data/english.stop'):
    """Yield up to ``number`` RAKE key phrases extracted from ``text``.

    Args:
        text: The document to analyse.
        exclude: Container of phrases that must not be yielded.
        number: Maximum number of ``(keyword, weight)`` pairs to yield.
        stoplist_path: Stop-word file handed to ``Rake``.  Defaults to the
            path that used to be hard-coded here.

    Yields:
        ``(keyword, weight)`` tuples, in the order RAKE returns them, for
        phrases whose weight is greater than 1 and that are not in
        ``exclude``.
    """
    # Nothing was asked for: skip loading the stop list and running RAKE.
    if number <= 0:
        return

    r = Rake(stoplist_path)
    concepts = r.run(text, minCharacters=2, maxWords=4, minFrequency=3)

    count = 0
    for keyword, weight in concepts:
        if weight <= 1 or keyword in exclude:
            continue
        yield keyword, weight
        count += 1
        # Stop as soon as the quota is filled instead of scanning the rest.
        if count >= number:
            break
def rake(text, top_n=10):
    """Return the first ``top_n`` key phrases RAKE reports for ``text``.

    Characters that are not contained in ``printable`` are removed before
    extraction.  The extractor is built from ``NLTKStopList()``.  Only the
    phrases are returned; their scores are discarded.
    """
    # Keep only the characters found in ``printable``.
    kept_chars = [ch for ch in text if ch in printable]
    cleaned = ''.join(kept_chars)

    extractor = Rake(NLTKStopList())
    ranked = extractor.run(cleaned)

    phrases = []
    for phrase, _score in ranked[:top_n]:
        phrases.append(phrase)
    return phrases
class KWGet(Driver):
    """``Driver`` subclass that extracts RAKE keywords from base64 content."""

    def setup(self):
        """Build the RAKE extractor from the ``STOPWORDS`` file next to this module."""
        module_dir = os.path.dirname(__file__)
        self._rake = Rake(os.path.join(module_dir, STOPWORDS))

    def _get_keywords(self, text):
        """Base64-decode ``text`` and return what RAKE extracts from it."""
        decoded = b64decode(text)
        found = self._rake.run(decoded)
        if PRINT:
            print('KWGet: keywords=%s' % str(found))
        return found

    @wrapper
    def put(self, *args, **kwargs):
        """Extract keywords from the ``content`` keyword argument.

        Returns ``{'keywords': ...}`` when content was supplied and RAKE
        produced a non-empty result, otherwise ``None``.
        """
        content = kwargs.get('content')
        if not content:
            return None
        found = self._get_keywords(content)
        if not found:
            return None
        return {'keywords': found}
class KWGet(Driver):
    """Keyword-extraction driver: runs RAKE over base64-encoded content."""

    def setup(self):
        """Create ``self._rake`` using the ``STOPWORDS`` file beside this module."""
        stoplist = os.path.join(os.path.dirname(__file__), STOPWORDS)
        self._rake = Rake(stoplist)

    def _get_keywords(self, text):
        """Return the RAKE result for the base64-decoded form of ``text``."""
        result = self._rake.run(b64decode(text))
        if PRINT:
            print('KWGet: keywords=%s' % str(result))
        return result

    @wrapper
    def put(self, *args, **kwargs):
        """Return ``{'keywords': ...}`` for ``kwargs['content']``, else ``None``.

        ``None`` is returned both when no content is given and when RAKE
        yields an empty result.
        """
        payload = kwargs.get('content')
        extracted = self._get_keywords(payload) if payload else None
        if extracted:
            return {'keywords': extracted}
        return None
def giveKeyword(text):
    """Extract up to two RAKE key phrases from HTML text.

    Steps: strip the markup with BeautifulSoup, tokenize the text into
    word-character runs, re-join the tokens with spaces, and run RAKE with
    the ``SmartStoplist.txt`` stop list.

    Returns the first two phrases RAKE reports (fewer if fewer exist), each
    terminated by a comma, concatenated into one string.
    """
    from bs4 import BeautifulSoup
    from nltk.tokenize import RegexpTokenizer

    stripped = BeautifulSoup(text, "lxml").get_text()
    words = RegexpTokenizer(r'\w+').tokenize(stripped)

    def _spaced(word):
        # Apostrophe-led tokens and punctuation attach to the previous token.
        if word.startswith("'") or word in string.punctuation:
            return word
        return " " + word

    cleanText = "".join(map(_spaced, words)).strip()

    extractor = Rake("SmartStoplist.txt")
    top_two = extractor.run(cleanText)[0:2]
    return "".join(item[0] + "," for item in top_two)
class Analyzer:
    '''analyzes text for sentiment and important terms'''

    def __init__(self):
        # RAKE extractor configured with the local stop-word file.
        self.filter = Rake('stoplist.txt')

    def sentiment(self, text):
        '''transform sentiment into trinary value

        Returns "1" when TextBlob's polarity is above 0.2, "-1" when it is
        below -0.2, and "0" otherwise.
        '''
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.2:
            return "1"
        if polarity < -0.2:
            return "-1"
        return "0"

    def terms(self, text):
        '''get most important terms from text

        Returns the phrase of the first entry RAKE reports, or '' when the
        text is empty or RAKE finds no phrase at all.
        '''
        if not text:
            return ''
        ranked = self.filter.run(text)
        # RAKE can return an empty list (e.g. stop words only); indexing it
        # unconditionally used to raise IndexError.
        if not ranked:
            return ''
        return ranked[0][0]
def get_doc_keywords(html, articleDom):
    '''
    Return up to five keywords for a document.

    Search the meta keyword tag(s) for any predefined keywords.
    Else use the RAKE library to extract keywords from the document content.

    html       -- markup of the whole page, searched for <meta name="keywords">
    articleDom -- markup of the article body, used only for the RAKE fallback

    Returns a list of at most five keyword strings with surrounding
    spaces stripped.
    '''
    tree = fromstring(html)
    # xpath() yields a list with one string per matching attribute, not a
    # single string, so the values are joined before being split on commas.
    meta_contents = tree.xpath('//meta[@name="keywords"]/@content')
    if meta_contents:
        declared = ','.join(meta_contents).split(',')[:5]  # first five keywords
        return [x.strip(' ') for x in declared]

    # Use RAKE to extract keywords from the article content.
    from RAKE import Rake

    node = fromstring(articleDom)
    text = node.text_content()
    # min 3 chars, max 3 words, word appears min 5 times
    extractor = Rake("RAKE/stoplists/SmartStoplist.txt", 3, 3, 5)
    keywords = [x[0] for x in extractor.run(text)][:5]  # get top five
    return [x.strip(' ') for x in keywords]
def get_keyword(text):
    """Return the first key phrase RAKE reports for ``text``.

    Returns an empty string when ``text`` is empty (or ``None``) or when
    RAKE finds no phrase at all.
    """
    # Check the input first so the stop list is not loaded for nothing.
    if not text:
        return ""
    keywords = Rake("SmartStoplist.txt").run(text)
    # RAKE can return an empty list; indexing it used to raise IndexError.
    if not keywords:
        return ""
    return keywords[0][0]
def generate_titles(file_name,
                    random=False,
                    use_rake=False,
                    use_summa_text_rank=False,
                    use_text_rank=False):
    """Build and rank candidate titles for the text stored in ``file_name``.

    The file is read and lower-cased, tokenized, stripped of stop words and
    wrapped in a ``Corpus`` object.  That object is then annotated step by
    step (POS tags, stems, splits, frequency/proximity data, word weights)
    by helper functions of this module.  Titles are built from it and
    ordered.

    Args:
        file_name: Path of the text file to process.
        random: Forwarded unchanged to ``get_word_weights``.
        use_rake: Also log the first five RAKE keywords.
        use_summa_text_rank: Also log the summa summary and keywords.
        use_text_rank: Also log the TextRank sentences and key phrases.

    Returns:
        Whatever ``order_titles`` returns for the built titles.
    """
    logger.info("Opening file")
    # The file is only needed for this single read, so it is closed right
    # away.  It used to stay open until after title building and leaked if
    # any step in between raised.
    with open(file_name) as text_file:
        logger.info("Reading file")
        raw_text = text_file.read().lower()
        logger.info("Closing file")

    # Remove Unicode characters.
    # NOTE(review): str.decode() and the two-argument translate() below look
    # like Python 2 ``str`` APIs; confirm the target interpreter.
    raw_text = raw_text.decode('unicode_escape').encode('ascii', 'ignore')

    # Convert raw text to word tokens
    logger.info("Tokenizing")
    tokens = nltk.word_tokenize(raw_text.translate(None, string.punctuation))

    # Remove stopwords
    logger.info("Removing stopwords")
    stop_words = set(stopwords.words('english'))
    # NOTE: we need to include some more stopwords, as 'english' doesn't
    #       contain some stopwords related to journal articles
    #       (e.g., "et" and "al" in "et al.")
    stop_words.update(ADDITIONAL_STOPWORDS)
    filtered_text = [word for word in tokens if word not in stop_words]

    # Create Corpus object for input text
    logger.info("Creating corpus object")
    input_text = Corpus(raw_text, tokens, filtered_text)
    input_text.stop_words = stop_words
    logger.info("Filtered words to use")
    logger.info("\t %s" % input_text.filtered_tokens[:5])

    # NOTE: stopwords are removed before POS tags assigned, this could
    #       potentially degrade POS tagging performance - may want to
    #       switch this order

    # Demonstrate functions
    logger.info("Getting POS tags")
    input_text.pos_tags = pos_tagger(input_text)
    logger.info("\t %s" % input_text.pos_tags[:5])

    logger.info("Finding all used parts of speech.")
    input_text.used_pos = set([tag[1] for tag in input_text.pos_tags])
    logger.info(input_text.used_pos)

    logger.info("Getting stemmed words")
    input_text.stemmed_words = stem_tokens(input_text)
    logger.info("\t %s" % input_text.stemmed_words[:5])

    # split the stemmed words into ~equal-sized groups
    logger.info("Splitting the stemmed words into groups")
    num_splits = 2
    input_text.splits = split_tokens(input_text, num_splits)

    logger.info("Getting word frequency and proximity")
    # A larger cutoff is used when fewer than 250 filtered tokens exist.
    cutoff = 0.125
    if len(input_text.filtered_tokens) < 250:
        cutoff = 0.35
    input_text.word_freq_proximity = stems_frequency_proximity(
        input_text, cutoff)

    logger.info("Mapping filtered words and their stemmed forms")
    input_text.filtered_word_and_bases, input_text.filtered_bases_and_words = stems_and_bases(
        input_text)

    logger.info("Mapping POS tags and words")
    input_text.pos_tag_and_words = pos_tags_and_words(input_text)

    logger.info("------ End Processing ------\n\n")

    ##########################
    if use_rake:
        logger.info("------ Begin Rake ------")
        # More information at: https://github.com/fabianvf/python-rake
        r = Rake(RAKE.SmartStopList())
        sorted_keywords = r.run(input_text.raw_text)
        logger.info("Sorted keywords: %s" % sorted_keywords[:5])
        logger.info("------ End Rake ------\n\n")

    if use_summa_text_rank:
        logger.info("------ Begin SummaTextRank ------")
        # More information at https://github.com/summanlp/textrank
        logger.info("Sentence(s) summary: %s" % summarizer.summarize(raw_text))
        logger.info("Keywords: %s" % keywords.keywords(raw_text))
        logger.info("------ End SummaTextRank ------\n\n")

    if use_text_rank:
        logger.info("------ Begin TextRank ------")
        # More information at https://github.com/davidadamojr/TextRank
        logger.info("Sentence(s) summary: %s " %
                    textrank.extract_sentences(raw_text))
        logger.info("Keywords: %s" % textrank.extract_key_phrases(raw_text))
        logger.info("------ End TextRank ------\n\n")

    ##########################
    logger.info("------ Begin Weighting ------")
    logger.info("Calculating word weights")
    input_text.word_weights = get_word_weights(input_text, random)

    logger.info("Printing word weights")
    weight_thresh = -1
    print_words_with_weight_above(weight_thresh, input_text.word_weights,
                                  input_text)
    logger.info("------ End Weighting ------\n\n")

    ##########################
    logger.info("------ Begin Building ------")
    titles = build_titles(input_text)
    logger.info("------ End Building ------\n\n")

    ##########################
    logger.info("------ Begin Ranking ------")
    # NOTE: the scores denote the title rankings relative to one another
    #       1 denotes the title with the highest rank and 0 denotes the
    #       title with the lowest rank (determined by a combination of
    #       summed word weights and average word weight)
    titles_ranked = order_titles(titles, input_text)
    logger.info("------ End Ranking ------\n\n")

    ##########################
    return titles_ranked
import csv
from RAKE import Rake
import email
import os

# Batch script: run RAKE over every mail found under
# <cwd>/Data_set/Data_set/<folder>/ and write one CSV of (topic, word_score)
# rows per mail to <cwd>/Data_set/Result_data_set/<folder>/<mail>.csv.
# The result folders are expected to exist already.
base = os.getcwd() + '/Data_set'
topic_identifier_instance = Rake('Stop_list.txt')

for folder in os.listdir(base + '/Data_set'):
    fold = base + '/Data_set/{}'.format(folder)
    for file_name in os.listdir(fold):
        # Close the mail file as soon as it is parsed (it used to be leaked).
        with open(fold + '/{}'.format(file_name)) as mail_file:
            mail = email.message_from_file(mail_file)

        # NOTE(review): get_payload() returns a list for multipart messages;
        # this assumes single-part mails - confirm against the data set.
        message_string = mail.get_payload()

        score_table = topic_identifier_instance.run(message_string)
        score_table.insert(0, ('topic', 'word_score'))  # header row
        print(score_table)

        result_path = (base + '/Result_data_set' + '/' + folder + '/' +
                       file_name + '.csv')
        # The context manager guarantees the rows are flushed and the handle
        # is released; the writer's file used to be left open.
        with open(result_path, 'w+') as result_file:
            csv.writer(result_file).writerows(score_table)