def rakeUP():
    """Extract keywords from 'ScrapedData.txt' and write them to 'oriList.csv'.

    The whole input file is read and passed to a Rake object built from
    "rake/SmartStoplist.txt" with the positional arguments (3, 1, 1).
    For every result the first element (the keyword) is written on its own
    line after removing every character that is not an ASCII letter, so
    digits, punctuation and whitespace inside a keyword are dropped.

    Takes no arguments and returns nothing; a confirmation line is printed
    once the output file has been written.
    """
    outputFile = 'oriList.csv'
    inputFile = 'ScrapedData.txt'

    rake_object = rake.Rake("rake/SmartStoplist.txt", 3, 1, 1)

    # Context managers guarantee both handles are closed (and the output
    # flushed) even if reading, extraction or writing raises.
    with open(inputFile, 'r') as sample_file:
        text = sample_file.read()
    keywords = rake_object.run(text)

    with open(outputFile, 'w') as target:
        for keyword in keywords:
            target.write(re.sub('[^a-zA-Z]+', '', keyword[0]))
            target.write('\n')

    # Single pre-formatted argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('written output to  %s' % outputFile)
def genParSeeds(text, numSeeds=10, maxWords=3):
    """Return one list of 'seeds' (keywords) for each paragraph of text.

    The text is split on newline characters and every resulting piece is
    treated as a paragraph. One RAKE object is created and shared by all
    paragraphs.

    args:
        text     - text input (article)
        numSeeds - number of keywords requested per paragraph
        maxWords - maximum phrase length; the default means seeds will be
                   phrases of 3 words or less
    returns:
        list of lists of strings, one inner list per paragraph
    """
    min_chars = 5
    min_freq = 1

    pieces = text.split("\n")
    shared_rake = rake.Rake("rake/SmartStopList.txt", min_chars, maxWords, min_freq)

    seeds_per_paragraph = []
    for piece in pieces:
        seeds_per_paragraph.append(genSeeds(piece, numSeeds, maxWords, shared_rake))
    return seeds_per_paragraph
示例#3
0
    def __init__(self):
        """Create the categorizers, the preprocessing patterns, the stop lists and RAKE."""
        # Two categorizer instances, one from each categorizer module.
        self.catz = mcategorizerClass.Categorize()
        self.cats = categorizerClass.Categorize()

        # Pattern for a backslash followed by one or more ASCII alphanumerics.
        self.Code = r"\\[a-zA-Z0-9]+"

        # Patterns used for preparation, in this order.
        self.ReList = [
            Url_RE,
            Entity,
            Timelike,
            NumNum,
            NumberWithCommas,
            self.Code,
            Punct,
            Separators,
            Decorations,
        ]

        # English stop words plus a separate list of punctuation symbols.
        self.stoplistw = stopwords.words('english')
        self.stoplist = [
            ")", "(", ".", "'", ",", ";", ":", "?",
            "/", "!", "@", "$", "*", "+", "-", "_",
            "=", "&", "%", "`", "~", "\"", "{", "}",
        ]

        # RAKE extractor using the stop list found under `path`.
        self.rake = rake.Rake(path + "/rake/SmartStoplist.txt")
示例#4
0
def genSeeds(text, numSeeds=10, maxWords=3, rake_obj=None):
    """Return a list of 'seeds', or keywords, from text input.

    args:
        text     - text input (article)
        numSeeds - maximum number of keywords to return
        maxWords - maximum phrase length; the default means seeds will be
                   phrases of 3 words or less. Only used when this function
                   builds its own RAKE object; ignored if rake_obj is given
        rake_obj - optional RAKE object (anything with a run(text) method
                   returning a sequence of (keyword, score) pairs)
    returns:
        list of strings: the keyword of each of the first numSeeds pairs
        returned by rake_obj.run(text)
    """
    minChars = 5
    minFreq = 3

    # Explicit None check: a caller-supplied object must be used even if it
    # happens to evaluate as falsy (e.g. it defines __len__ or __bool__).
    if rake_obj is None:
        rake_obj = rake.Rake("rake/SmartStopList.txt", minChars, maxWords, minFreq)

    keywords = rake_obj.run(text)
    return [s for s, _ in keywords[:numSeeds]]
示例#5
0
import dateutil.parser
import feedparser
import html
import inspect
import math
import networkx
import nltk
import os
import pytz
import re
import sys

# Module-level dictionary, empty at import time; how it is populated and
# read is not visible in this part of the file.
env = {}

# For RAKE.
# A single extractor is built at import time from the stop list at
# 'rake/SmartStoplist.txt'.
# NOTE(review): the positional arguments 5 and 3 are presumably the minimum
# characters per word and the maximum words per phrase; confirm against the
# rake.Rake signature.
raker = rake.Rake('rake/SmartStoplist.txt', 5, 3)

class NewsSource(object):
    """Plain record describing a news source.

    Every constructor argument is stored unchanged on the instance under
    the same name: name, link, filecache, titleRe and summaryRe. The last
    three are optional and default to None.
    """

    def __init__(self, name, link, filecache=None, titleRe=None, summaryRe=None):
        # Required identification of the source.
        self.name, self.link = name, link
        # Optional settings; None when not supplied.
        self.filecache = filecache
        self.titleRe, self.summaryRe = titleRe, summaryRe

class NewsItem(object):
    """A single news entry together with the source it came from.

    Stores the entry's link, its thumbnails (an empty list when the entry
    has no 'media_thumbnail' key) and the given source.
    """

    def __init__(self, source, entry):
        # Ordinary properties.
        self.link = entry.link
        # Thumbnails are optional on an entry; fall back to an empty list.
        if 'media_thumbnail' in entry.keys():
            self.thumbnails = entry.media_thumbnail
        else:
            self.thumbnails = []
        self.source = source
示例#6
0
import rake.rake as rake
import operator

#data/docs/fao_test/w2167e.txt

# Three RAKE extractors built at import time, all using the stop list file
# "tamilStopList.txt" and differing only in their numeric arguments.
# NOTE(review): the numbers are presumably (minimum characters, maximum
# words per phrase, minimum keyword frequency), which would match the
# uni/bi/tri naming (1, 2 and 3 words); confirm against rake.Rake.
rake_obj_uni = rake.Rake("tamilStopList.txt", 6, 1, 2)
rake_obj_bi = rake.Rake("tamilStopList.txt", 8, 2, 1)
rake_obj_tri = rake.Rake("tamilStopList.txt", 12, 3, 1)
'''
***use RAKE to extract keywords
***i/p: sentences i arraylist
***o/p: 2d array with string and score
'''


def getKeys(sent):
    """Run the three module-level RAKE extractors over ``sent``.

    Takes the first 20 results of the unigram extractor and the first 10
    results of each of the bigram and trigram extractors.

    NOTE(review): in the visible body only the unigram results are stored
    (under keys['uni']); the bigram/trigram results are never stored and
    nothing is returned. The definition looks cut off at this point;
    confirm against the full file.
    """
    # Placeholders; each is overwritten by the matching extractor below.
    bi = []
    tri = []
    uni = []
    # Result container keyed by n-gram kind.
    keys = {}

    #get unigram and scores (first 20 results)
    uni = rake_obj_uni.run(sent)[0:20]

    #get bigram and scores (first 10 results)
    bi = rake_obj_bi.run(sent)[0:10]

    #get trigram and scores (first 10 results)
    tri = rake_obj_tri.run(sent)[0:10]

    keys['uni'] = uni
示例#7
0
 def post(self, request):
     """Return the multi-word RAKE keywords of the posted text as a JSON string.

     The request body is parsed as JSON and its "text" entry is run through
     a default-constructed Rake. From each result the first element is
     taken, and only those containing at least one space (more than one
     piece when split on " ") are kept. The response body is the JSON
     object {"keywords": [...]}.
     """
     payload = json.loads(request.body)
     scored = rake.Rake().run(payload["text"])
     # Keep only phrases made of more than one space-separated word.
     phrases = [item[0] for item in scored if len(item[0].split(" ")) > 1]
     return HttpResponse(json.dumps({"keywords": phrases}))