def keywords_from_url():
    """
    URL : /keywords_from_url
    Extract keywords from the text content of a web page.
    Method : POST
    Form data :
        - url : the url to analyze [string, required]
        - hits : limit number of keywords returned [int, optional, 100 by default]
    Return a JSON dictionary : {"keywords":[list of keywords]}
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url and extract its main text content
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage('No content to analyze')
    text_content = url.extract_content(url_data.text)

    # analyze text and extract keywords
    keywords = language.keyword_mining(text_content)

    # limit the number of keywords
    total = len(keywords)
    hits = int(data.get("hits", 100))
    keywords = [kw for kw, score in keywords.most_common(hits)]

    return jsonify(keywords=keywords, total=total)
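
# Usage sketch (assumption, not part of this module): how a client might call the
# /keywords_from_url endpoint above with the `requests` library. The host, port and
# route prefix are illustrative only.
def example_keywords_request():
    import requests  # assumed available in the client environment
    resp = requests.post(
        "http://localhost:5000/keywords_from_url",
        data={"url": "https://example.com", "hits": 20},
    )
    # expected shape: {"keywords": [...], "total": <int>}
    return resp.json()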
def index_job(link):
    """ Index a single page. """
    print("index page : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True
        }
    })
    process.crawl(crawler.SingleSpider,
                  start_urls=[link],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()  # block until finished
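
# Dispatch sketch (assumption): index_job is designed to run inside a redis-rq
# worker, so a caller would typically enqueue it rather than invoke it inline.
# Queue name and Redis connection parameters are illustrative.
def example_enqueue_index(link):
    from redis import Redis
    from rq import Queue
    q = Queue("default", connection=Redis())
    return q.enqueue(index_job, link)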
def explore_job(link):
    """ Explore a website and index all urls (redis-rq process). """
    print("explore website at : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "last_crawl": datetime.now()
    })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': True,
            'scrapy.extensions.closespider.CloseSpider': True
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debug
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[link],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()

    return 1
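
# Verification sketch (assumption): after explore_job runs, the domain document
# should be readable back from the "web" index, using the same doc_type/id scheme
# as the client.index() call above.
def example_get_domain_doc(link):
    domain = url.domain(link)
    return client.get(index="web", doc_type="domain", id=domain)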
def index():
    """
    URL : /index
    Index a new URL in search engine.
    Method : POST
    Form data :
        - url : the url to index [string, required]
    Return a success message.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage("URL is invalid or has no text inside")

    # get main language of page
    lang = url.detect_language(url_data.text)
    if lang not in languages:
        raise InvalidUsage('Language not supported')

    # extract title of url
    title = url.extract_title(url_data.text)

    # extract description of url
    description = url.extract_description(url_data.text)

    # extract main content of url
    body = url.extract_content(url_data.text, languages.get(lang))

    # index url and data
    res = client.index(index="web-%s" % lang, doc_type='page', id=data["url"], body={
        "title": title,
        "description": description,
        "body": body,
        "url": data["url"]
    })

    return "Success"
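
# Usage sketch (assumption): indexing a page through the /index route. Host and
# port are illustrative; the endpoint only needs a "url" form field.
def example_index_request(page_url):
    import requests  # assumed available in the client environment
    resp = requests.post("http://localhost:5000/index", data={"url": page_url})
    return resp.text  # "Success" when indexing succeeded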
def reference_job(link, email):
    """ Request the referencing of a website. """
    print("referencing page %s with email %s" % (link, email))

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "email": email
    })

    return 1
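
# Lookup sketch (assumption): listing the domains referenced with a given email,
# via a simple match query against the "web" index written by reference_job. The
# field mapping is not shown in this excerpt, so the query shape is illustrative.
def example_domains_for_email(email):
    return client.search(index="web", doc_type="domain",
                         body={"query": {"match": {"email": email}}})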
def explore_job(link):
    """ Explore a website and index all urls (redis-rq process). """
    logging.info("explore website at : %s" % link)

    # get final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    def f(q):
        try:
            """
            __author__  : Bijin Benny
            __email__   : [email protected]
            __license__ : MIT
            __version__ : 1.0
            Modification : The original code used the CrawlerProcess class from the
            scrapy library to crawl web pages. However, CrawlerProcess could not run
            in parallel inside Redis task threads, so it was replaced by the
            CrawlerRunner class, which can run in parallel across multiple Redis tasks.
            """
            runner = CrawlerRunner({
                'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
                'DOWNLOAD_TIMEOUT': 100,
                'DOWNLOAD_DELAY': 0.25,
                'ROBOTSTXT_OBEY': True,
                'HTTPCACHE_ENABLED': False,
                'REDIRECT_ENABLED': False,
                'SPIDER_MIDDLEWARES': {
                    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': True,
                    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True,
                    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': True,
                    'scrapy.extensions.closespider.CloseSpider': True
                },
                'CLOSESPIDER_PAGECOUNT': 500  # only for debug
            })
            runner.crawl(crawler.Crawler,
                         allowed_domains=[urlparse(link).netloc],
                         start_urls=[link],
                         es_client=es,
                         redis_conn=redis_conn)
            d = runner.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    # run the crawl in a separate process so each job gets a fresh Twisted reactor
    q = Q()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()  # block until the crawl finishes or fails
    p.join()
    if result is not None:
        raise result
    return 1
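
# Import sketch (assumption): the names used by explore_job above (logging, Q,
# Process, urlparse, CrawlerRunner, reactor) are not imported in this excerpt;
# they would typically come from the modules below. Project-local objects
# (url, crawler, es, redis_conn) are defined elsewhere and are omitted here.
import logging
from multiprocessing import Process, Queue as Q
from urllib.parse import urlparse
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor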