# The original snippet omits its imports; `re` is needed for the substitution
# below, and some RAKE module exposing a Rake class is assumed to be importable.
import re
import rake

def rakeUP():
    outputFile = 'oriList.csv'
    inputFile = 'ScrapedData.txt'
    # stoplist path, min chars per word, max words per phrase, min keyword frequency
    rake_object = rake.Rake("rake/SmartStoplist.txt", 3, 1, 1)
    with open(inputFile, 'r') as sample_file:
        text = sample_file.read()
    keywords = rake_object.run(text)
    with open(outputFile, 'w') as target:
        for phrase, _score in keywords:
            # keep letters only before writing each keyword
            target.write(re.sub('[^a-zA-Z]+', '', phrase))
            target.write('\n')
    print('written output to', outputFile)
def genParSeeds(text, numSeeds=10, maxWords=3):
    """
    Returns lists of 'seeds', one per paragraph.

    args:
        text     - text input (article)
        numSeeds - number of keywords you want to return
        maxWords - maximum phrase length. default means seeds will be
                   phrases of 3 words or less

    returns:
        list of lists of strings
    """
    minChars = 5
    minFreq = 1
    paragraphs = text.split("\n")
    rake_obj = rake.Rake("rake/SmartStopList.txt", minChars, maxWords, minFreq)
    return [genSeeds(p, numSeeds, maxWords, rake_obj) for p in paragraphs]
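# Hypothetical usage sketch, not part of the original source: it assumes the
# rake module, the rake/SmartStopList.txt file, and genSeeds (shown further
# below) are all available in the same module. The article text is a placeholder.
article = (
    "Deep learning has transformed natural language processing.\n"
    "Keyword extraction is a common preprocessing step for summarization."
)
per_paragraph_seeds = genParSeeds(article, numSeeds=5, maxWords=2)
for idx, seeds in enumerate(per_paragraph_seeds):
    print("paragraph", idx, "->", seeds)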
def __init__(self):
    # Create Classes
    self.catz = mcategorizerClass.Categorize()
    self.cats = categorizerClass.Categorize()
    # Init Prep List
    self.Code = r"\\[a-zA-Z0-9]+"
    self.ReList = [
        Url_RE, Entity, Timelike, NumNum, NumberWithCommas,
        self.Code, Punct, Separators, Decorations
    ]
    self.stoplistw = stopwords.words('english')
    self.stoplist = [
        ")", "(", ".", "'", ",", ";", ":", "?", "/", "!", "@", "$", "*",
        "+", "-", "_", "=", "&", "%", "`", "~", "\"", "{", "}"
    ]
    # Load RAKE
    self.rake = rake.Rake(path + "/rake/SmartStoplist.txt")
def genSeeds(text, numSeeds=10, maxWords=3, rake_obj=None):
    """
    Returns a list of 'seeds', or keywords from text input.

    args:
        text     - text input (article)
        numSeeds - number of keywords you want to return
        maxWords - maximum phrase length. default means seeds will be
                   phrases of 3 words or less
        rake_obj - optional RAKE object

    returns:
        list of strings
    """
    minChars = 5
    minFreq = 3
    if not rake_obj:
        rake_obj = rake.Rake("rake/SmartStopList.txt", minChars, maxWords, minFreq)
    keywords = rake_obj.run(text)
    return [s for s, _ in keywords[:numSeeds]]
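# Hypothetical usage sketch, not part of the original source: passing one
# shared Rake object avoids re-reading the stoplist on every call. The
# stoplist path and the document strings are placeholders.
shared_rake = rake.Rake("rake/SmartStopList.txt", 5, 3, 3)
docs = ["first article text ...", "second article text ..."]
all_seeds = [genSeeds(d, numSeeds=5, rake_obj=shared_rake) for d in docs]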
import dateutil.parser
import feedparser
import html
import inspect
import math
import networkx
import nltk
import os
import pytz
import re
import sys

# The original snippet does not show how the RAKE module is imported; some
# module exposing Rake is required for the call below.
import rake

env = {}

# For RAKE: stoplist path, min 5 chars per word, phrases of at most 3 words.
raker = rake.Rake('rake/SmartStoplist.txt', 5, 3)


class NewsSource(object):
    def __init__(self, name, link, filecache=None, titleRe=None, summaryRe=None):
        self.name = name
        self.link = link
        self.filecache = filecache
        self.titleRe = titleRe
        self.summaryRe = summaryRe


class NewsItem(object):
    def __init__(self, source, entry):
        # Ordinary properties.
        self.link = entry.link
        self.thumbnails = entry.media_thumbnail if 'media_thumbnail' in entry.keys() else []
        self.source = source
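# Hypothetical usage sketch, not part of the original source: wrap one feed's
# entries in NewsItem objects. The feed name and URL are illustrative placeholders.
src = NewsSource("Example Wire", "https://example.com/rss.xml")
feed = feedparser.parse(src.link)
items = [NewsItem(src, entry) for entry in feed.entries]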
import rake.rake as rake
import operator

# data/docs/fao_test/w2167e.txt
rake_obj_uni = rake.Rake("tamilStopList.txt", 6, 1, 2)
rake_obj_bi = rake.Rake("tamilStopList.txt", 8, 2, 1)
rake_obj_tri = rake.Rake("tamilStopList.txt", 12, 3, 1)

'''
*** use RAKE to extract keywords
*** i/p: sentences in a list
*** o/p: 2d array with string and score
'''
def getKeys(sent):
    bi = []
    tri = []
    uni = []
    keys = {}
    # get unigrams and scores
    uni = rake_obj_uni.run(sent)[0:20]
    # get bigrams and scores
    bi = rake_obj_bi.run(sent)[0:10]
    # get trigrams and scores
    tri = rake_obj_tri.run(sent)[0:10]
    keys['uni'] = uni
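# Hypothetical usage sketch, not part of the original source: the Rake objects
# above can also be queried directly; run() returns (phrase, score) pairs in
# descending score order. The sample sentence is a placeholder and the results
# depend on tamilStopList.txt being present.
sample = "keyword extraction with rake works on raw sentence strings"
for phrase, score in rake_obj_bi.run(sample)[:5]:
    print(phrase, score)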
def post(self, request):
    dic = json.loads(request.body)
    # Run RAKE over the posted text and keep only multi-word keyphrases.
    ret = rake.Rake().run(dic["text"])
    ret = filter(lambda x: len(x.split(" ")) > 1, map(lambda x: x[0], ret))
    ret = {"keywords": list(ret)}
    return HttpResponse(json.dumps(ret))
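# Hypothetical client-side sketch, not part of the original source: POST text
# to wherever this view is routed; the URL and sample text are placeholders.
import json
import requests

resp = requests.post(
    "http://localhost:8000/keywords/",
    data=json.dumps({"text": "rapid automatic keyword extraction of keyphrases"}),
    headers={"Content-Type": "application/json"},
)
print(resp.json()["keywords"])  # only multi-word phrases survive the filter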