def pipeline(response, spider):
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # check for redirect url
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
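# Note: `url`, `languages`, and `InvalidUsage` are project-local names that are not
# defined in this snippet. The stubs below are only a sketch of what a hypothetical
# `url` helper module is assumed to expose, inferred from the calls above; they are
# not the project's actual helpers. `languages` is assumed to be a dict mapping
# supported language codes to per-language settings such as stopword lists.
from urllib.parse import urlparse

from bs4 import BeautifulSoup   # assumed HTML-parsing dependency
from langdetect import detect   # assumed language-detection dependency


def domain(page_url):
    """Return the network location of a URL, e.g. 'example.com'."""
    return urlparse(page_url).netloc


def detect_language(html):
    """Guess the main language code ('en', 'fr', ...) of an HTML document."""
    text = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
    return detect(text) if text else ""


def extract_content(html, language_settings=None):
    """Rough stand-in returning (main_text, boilerplate_text)."""
    soup = BeautifulSoup(html, "html.parser")
    boilerplate = " ".join(tag.get_text(" ", strip=True)
                           for tag in soup.find_all(["nav", "header", "footer", "aside"]))
    body = soup.get_text(" ", strip=True)
    return body, boilerplate


def create_description(body, max_length=160):
    """Build a short description from the page body; empty if there is no text."""
    return body[:max_length].strip()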
def index():
    """
    URL : /index
    Index a new URL in search engine.

    Method : POST
    Form data :
        - url : the url to index [string, required]

    Return a success message.
    """
    # get POST data
    data = dict((key, request.form.get(key)) for key in request.form.keys())
    if "url" not in data:
        raise InvalidUsage('No url specified in POST data')

    # crawl url
    url_data = url.crawl(data["url"])
    if not url_data:
        raise InvalidUsage("URL is invalid or has no text inside")

    # get main language of page
    lang = url.detect_language(url_data.text)
    if lang not in languages:
        raise InvalidUsage('Language not supported')

    # extract title of url
    title = url.extract_title(url_data.text)

    # extract description of url
    description = url.extract_description(url_data.text)

    # extract main content of url
    body = url.extract_content(url_data.text, languages.get(lang))

    # index url and data
    res = client.index(index="web-%s" % lang, doc_type='page', id=data["url"], body={
        "title": title,
        "description": description,
        "body": body,
        "url": data["url"]
    })

    return "Success"
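# `client` is assumed to be an Elasticsearch client created elsewhere in the app,
# and `InvalidUsage` an API exception the Flask app converts into an error response.
# The class below is a common Flask pattern and only an assumption about how the
# project defines it, followed by an example call against a locally running app
# (host and port are illustrative, not taken from this code).
import requests


class InvalidUsage(Exception):
    """API error carrying a message and an HTTP status code."""

    def __init__(self, message, status_code=400):
        super().__init__(message)
        self.message = message
        self.status_code = status_code


# example client call for the /index endpoint
resp = requests.post("http://localhost:5000/index", data={"url": "https://example.com"})
print(resp.status_code, resp.text)   # expected: 200 Success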
def pipeline(response, spider):
    """
    Index a page.
    """
    # skip rss or atom urls
    if not response.css("html").extract_first():
        return

    # get domain
    domain = url.domain(response.url)

    # extract title
    title = response.css('title::text').extract_first()
    title = title.strip() if title else ""

    # extract description
    description = response.css("meta[name=description]::attr(content)").extract_first()
    description = description.strip() if description else ""

    # get main language of page, and main content of page
    lang = url.detect_language(response.body)
    if lang not in languages:
        raise InvalidUsage('Language not supported')
    body, boilerplate = url.extract_content(response.body, languages.get(lang))

    # weight of page
    weight = 3
    if not title and not description:
        weight = 0
    elif not title:
        weight = 1
    elif not description:
        weight = 2
    if body.count(" ") < boilerplate.count(" ") or not url.create_description(body):
        # probably bad content quality
        weight -= 1

    # -- TEST -- #
    # keywords = Counter()
    # text_for_keywords = "%s\n%s\n%s" % (title, description, bestbody)
    # r = requests.post('http://localhost:5001/keywords_from_text', data={'text': text_for_keywords})
    # data = r.json()
    # for k in data["keywords"]:
    #     keywords[k] += 1
    # keywords = " ".join(["%s " % (kw) * score for kw, score in keywords.most_common(100)])

    # index url and data
    res = spider.es_client.index(index="web-%s" % lang, doc_type='page', id=response.url, body={
        "url": response.url,
        "domain": domain,
        "title": title,
        "description": description,
        "body": body,
        "weight": weight
    })

    # try to create thumbnail from page
    img_link = response.css("meta[property='og:image']::attr(content)").extract_first()
    if not img_link:
        img_link = response.css("meta[name='twitter:image']::attr(content)").extract_first()
    if img_link:
        q = Queue(connection=spider.redis_conn)
        q.enqueue(create_thumbnail, response.url, lang, img_link)

    # check for redirect url
    if response.status in spider.handle_httpstatus_list and 'Location' in response.headers:
        newurl = response.headers['Location']
        meta = {'dont_redirect': True, "handle_httpstatus_list": spider.handle_httpstatus_list}
        meta.update(response.request.meta)
        return Request(url=newurl.decode("utf8"), meta=meta, callback=spider.parse)
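# Sketch of how this pipeline callback could be wired into a Scrapy spider. The
# spider only needs to expose the attributes the pipeline reads (es_client,
# redis_conn, handle_httpstatus_list) and delegate parse() to it. The class name,
# hosts, and status list below are assumptions for illustration, not the project's
# actual spider.
import redis
import scrapy
from elasticsearch import Elasticsearch


class IndexSpider(scrapy.Spider):
    name = "indexer"
    start_urls = ["https://example.com"]
    handle_httpstatus_list = [301, 302]   # redirects are handled manually in pipeline()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.es_client = Elasticsearch("http://localhost:9200")
        self.redis_conn = redis.Redis()

    def parse(self, response):
        # pipeline() indexes the page and may return a follow-up Request when the
        # response is an unfollowed redirect
        return pipeline(response, self)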