def terms(url):
    terms = {}
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content)
    # print(soup.get_text())
    '''
    for script in soup(['script','style']):
        script.extract()
    text = soup.get_text().decode("utf-8")
    print(text)
    '''
    # Drop non-visible elements before pulling out the page text.
    [s.extract() for s in soup(
        ['style', 'script', '[document]', 'head', 'title', 'select'])]
    visible_text = soup.getText()
    # print(soup.getText())
    print(visible_text)
    # Persist the visible text so nltk can load it back as a raw corpus.
    f = open('haha4.txt', 'w')
    for i in visible_text:
        f.write(i.encode('utf-8'))
    f.close()
    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    # pattern for runs of well-formed UTF-8 byte sequences
    patt = "((?: [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF7][\x80-\xBF]{3}){1,100})"
    s = nltk.data.load('haha4.txt', format='raw').lower()
    re.sub(patt, '', s)
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    print(extractor(s))
    result = []
    for ss in extractor(s):
        # print(ss[0])
        # Split multi-word and hyphenated terms into individual words.
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if j not in result:
                    result.append(j)
    print(result)
    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")
    return result

from topia.termextract import extract


def extract_keywords(doc, lower=False):
    extractor = extract.TermExtractor()
    extractor.filter = extract.DefaultFilter()
    keywords_list = []
    keywords = extractor(doc)
    for keyword in keywords:
        # Each keyword is a (term, occurrences, strength) tuple; keep the term only.
        if lower:
            keywords_list.append(keyword[0].lower())
        else:
            keywords_list.append(keyword[0])
    return keywords_list

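# A minimal usage sketch for extract_keywords above; the sample text is an
# assumption added for illustration.
sample = ("Term extraction pulls multi-word terms such as term extraction "
          "and noun phrases out of text, and noun phrases tend to score well.")
print(extract_keywords(sample, lower=True))
# With the default filter, multi-word terms like 'term extraction' typically
# pass, while single words need at least three occurrences.
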
from topia.termextract import extract
from topia.termextract import tag


def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    # s = nltk.data.load('corpora/operating/td1.txt', format='raw')
    extractor.tagger(text)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)

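# A quick sketch of calling keyterms above; the sample sentence is an assumption
# added for illustration. Each result entry is a (term, occurrences, strength)
# tuple, where strength is the number of words in the term.
results = keyterms("The operating system schedules processes, and the "
                   "operating system also manages memory.")
for term, occurrences, strength in results:
    print(term, occurrences, strength)
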
def POST(self):
    import sys
    import re
    import simplejson as json
    from topia.termextract import extract

    extractor = extract.TermExtractor()
    # extractor.filter = extract.permissiveFilter
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)

    def term_compare(x, y):
        # Rank terms by occurrences plus twice their strength (word count).
        if y[1] + y[2] * 2 > x[1] + x[2] * 2:
            return 1
        elif y[1] == x[1] and y[2] == x[2]:
            return 0
        else:  # x < y
            return -1

    input = web.input(callback=None)
    content = input.context.lower()
    # Normalize curly quotes and ellipses before extraction.
    content = content.replace(u"\u201c", '"').replace(u"\u201d", '"').replace(
        u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u2026", "")
    # Python 2 style sort: term_compare is passed as the cmp argument.
    list = sorted(extractor(content), term_compare)
    list = list[:50]
    for i in range(len(list) - 1, -1, -1):
        # Drop single-character terms, phrases longer than two words, URLs, and
        # terms containing digits or no lowercase letters.
        if len(list[i][0]) == 1 or list[i][2] > 2 or (
                list[i][0].find("http") >= 0) or not re.search(
                    '[a-z]', list[i][0]) or re.search('[0-9]', list[i][0]):
            list.remove(list[i])
        else:
            # prepend /tags/ to match expected input on server
            list[i] = list[i][0].strip()
    callback = input.callback
    pattern = r'[^a-zA-Z0-9 ]'
    for i in range(len(list) - 1, -1, -1):
        if re.search(pattern, list[i]):
            list.remove(list[i])
    if len(sys.argv) > 2:
        length = int(sys.argv[2])
        if len(list) > length:
            list = list[:length]
    list = json.dumps(list, indent=4)
    if callback and re.match(r'^[a-zA-Z0-9._\[\]]+$', callback):
        # JSONP response when a valid callback name was supplied.
        return callback + '(' + list + ')'
    else:
        return list

def terms(url):
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")
    '''
    for script in soup(['script','style']):
        script.extract()
    text = soup.get_text().decode("utf-8")
    print(text)
    '''
    # Drop non-visible elements before pulling out the page text.
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    # print(visible_text)
    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    for i in visible_text:
        f.write(i.encode('utf-8'))
        # Count character occurrences for the pickled dictionary.
        if i not in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
    pickle.dump(terms, f2)
    f2.close()
    f.close()
    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    # print(extractor(s))
    return terms

def __init__(self):
    self.extractor = extract.TermExtractor()
    self.extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)

def parser(path='', blurb='', scale=3, minoccur=2, omitscores=False,
           boostphrases=False, size=0, raw=False):
    size = int(request.values.get('size', size))
    minoccur = int(request.values.get('minoccur', minoccur))
    scale = int(request.values.get('scale', scale))
    if 'omitscores' in request.values and \
            request.values['omitscores'].lower() not in ['false', '0', 'no']:
        omitscores = True
    if 'boostphrases' in request.values and \
            request.values['boostphrases'].lower() not in ['false', '0', 'no']:
        boostphrases = True
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=minoccur)
    if path:
        url = path
    else:
        url = request.values.get('url', '')
    if url and not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://' + url
    if url:
        # try:
        # NOTE: getting this to work requires `pip install spynner`, which needs
        # a bunch of other system packages (not all identified yet):
        #   sudo apt-get install libx11 libx11-dev xvfb libxtst-dev libpng-dev
        #   https://github.com/makinacorpus/spynner/#dependencies
        # It still fails on importing PyQt4, which is on the system but can't be
        # pulled into the virtualenv, so rely on non-JS page content for now.
        # import spynner
        # browser = spynner.Browser()
        # browser.create_webview(True)
        # browser.load(url, load_timeout=60)
        # c = browser._get_html()
        # browser.close()
        # except:
        loc = app.config.get('BASE_URL', 'http://cottagelabs.com')
        if url.startswith(loc):
            # Serve local pages from the models store instead of fetching over HTTP.
            lloc = url.replace(loc, '')
            if lloc == '':
                lloc = '/index'
            rec = models.Pages().pull_by_url(lloc)
            if rec is not None:
                c = rec.data.get('content', '')
            else:
                c = ''
        else:
            g = requests.get(url)
            c = g.text
    else:
        c = request.values.get('blurb', blurb)
    if c:
        content = _html_text(c)
        terms = extractor(content)
        result = {}
        for t, o, l in terms:
            # Optionally boost phrases by adding their word count to the
            # occurrence count before scaling.
            if boostphrases:
                ct = o + l
            else:
                ct = o
            result[t.lower()] = ct * scale
        res = [{"term": i[0], "score": i[1]}
               for i in sorted(result.items(), key=lambda x: x[1], reverse=True)
               if len(i[0].replace(' ', '')) > 2
               and i[0].replace(' ', '') not in url
               and i[0][0] in string.lowercase + string.uppercase
               and i[0][len(i[0]) - 1] in string.lowercase + string.uppercase]
        if omitscores:
            res = [i["term"] for i in res]
        if size != 0:
            res = res[:size]
    else:
        res = []
    if raw:
        return res
    else:
        resp = make_response(json.dumps(res))
        resp.mimetype = "application/json"
        return resp

# coding=utf-8
from bs4 import BeautifulSoup
import html2text
import requests
from topia.termextract import extract
from topia.termextract import tag

tagger = tag.Tagger('english')
tagger.initialize()
# create the extractor with the tagger
extractor = extract.TermExtractor(tagger=tagger)
extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)

url = "http://habrahabr.ru"
doc = requests.get(url).content
doc = BeautifulSoup(doc, from_encoding="utf-8")
# Strip <script> and <style> elements before pulling out the visible text.
[s.extract() for s in doc(u'script')]
[s.extract() for s in doc(u'style')]
doc = doc.get_text()
# doc = html2text.html2text(doc)
kw = extractor(doc)

'''
test: huify(u'эту неделю').encode('utf-8') == u'эту xyеделю'
'''


def huify(expr):
    word = expr
    if expr.rfind(' ') > -1:
        # Operate on the last word of the expression only.
        word = expr[expr.rindex(' ') + 1:]
    vowels = set(u'aeiouyаеиоуыяюё')
    mz = len(word)
    for vowel in vowels:

import os
'''
Spark test script
- How to run: spark-submit --master local[4] test.py
- contains examples to learn how to use the Spark Python API
'''
# Imports assumed from usage in this fragment.
from stop_words import get_stop_words
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark_cassandra import CassandraSparkContext
from topia.termextract import extract

stop_words = get_stop_words('english')
stop_words.append("https ://t")

extractor = extract.TermExtractor()
extractor.filter = extract.DefaultFilter()

if __name__ == "__main__":
    # command line arguments?

    # Spark configuration
    conf = SparkConf().setAppName("Tweet Data by City")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # read files
    rawTweets = sqlContext.read.json("../tweets.json")

    # split the data into two sets
    # rawTweets.geo is of type 'Column', which has the isNotNull() and isNull()
    # functions to check for nullity
    geoTweets = rawTweets.filter(rawTweets.geo.isNotNull())
    placeTweets = rawTweets.filter(rawTweets.geo.isNull())

from topia.termextract import extract


def extract_keyword(text):
    extractor = extract.TermExtractor()
    # Single-word terms need at least 2 occurrences; the very large
    # noLimitStrength means multi-word terms no longer pass automatically.
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2,
                                             noLimitStrength=65535)
    keywords = sorted(extractor(text))
    return keywords
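
# Illustrative call to extract_keyword above; the sample sentence is an
# assumption. With the filter configured as above, the result should mostly be
# single-word (term, occurrences, strength) tuples, sorted alphabetically.
print(extract_keyword("The scheduler assigns each process a priority; "
                      "the scheduler then preempts the running process."))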