def article(article_id):
    """Retrieve a sanitized article.

    Request
    -------

    ::

        GET /44d85795-248d-5899-b8ca-ac2bd8233755

    Response
    --------

    .. note::

        The following is formatted for readability and does not match the
        actual response from the API.  Also, the body parameter has been
        shortened to fit this example more concisely.

    ::

        HTTP/1.0 200 Ok

        {
          "body": "…Singularity, an Alternative Openstack Guest Agent | Hackery &c…
          "url": "http://blog.alunduil.com/posts/singularity-an-alternative-openstack-guest-agent.html",
          "created_at": {"$date": 1374007667571},
          "etag": "6e2f69536ca15cc18260bffe7583b849",
          "_id": "03db19bb92205b4fb5fc3c4c0e4b1279",
          "parsed_at": {"$date": 1374008521414},
          "size": 9964
        }

    """

    # Mongo stores the article keyed by the hex form of its UUID.
    article = get_collection("articles").find_one({ "_id": uuid.UUID(article_id).hex })

    logger.debug("article: %s", article)

    # 404 not only if the object doesn't exist but also if we haven't
    # sanitized the body yet (the etag is only set after sanitization).
    if article is None or "etag" not in article:
        abort(404)

    # The storage coordinates are internal bookkeeping — strip them from
    # the document before it is serialized back to the client.
    container_name = article.pop("text_container_name")
    object_name = article.pop("text_object_name")

    logger.debug("article: %s", article)

    # TODO Catch connection issues and return Temporarily Unavailable.

    # HEAD requests get headers only; skip the object-store round trip.
    if request.method != "HEAD":
        body_text = get_container(container_name).get_object(object_name).fetch()

        logger.debug("type(data): %s", type(body_text))
        logger.debug("len(data): %s", len(body_text))

        article["body"] = body_text

    # json_util handles BSON-specific values (dates, object ids).
    resp = make_response(json.dumps(article, default = json_util.default), 200)
    resp.mimetype = "application/json"
    resp.headers["Access-Control-Allow-Origin"] = Parameters()["server.domain"]

    return resp
def sanitize_html_consumer(channel, method, header, body):
    """Download and sanitize the HTML for the given article.

    The HTML should be simplified as much as possible without modifying the
    feel of the structure to someone reading the content of the body of the
    document.

    .. note::

        Analysis will be necessary that shows the statistics on sanitized
        HTML size for a determination as to whether we can store it inline
        in Mongo or out of band in an object store like Rackspace Cloud
        Files.

    The decisions and algorithms used for streamlining the HTML are not
    proprietary in any way and can be used and modified under the terms of
    this file's licensing but more importantly can be improved or modified
    if imperfections are found.

    :param channel: AMQP channel the message arrived on (used to ack).
    :param method: AMQP method frame carrying the delivery tag.
    :param header: AMQP header frame (unused).
    :param body: JSON message body containing the article's ``_id``.

    """

    _id = json.loads(body)["_id"]

    logger.debug("article._id: %s", _id)

    articles = get_collection("articles")

    article = articles.find_one({ "_id": _id }, { "_id": 0 })

    if article is None:
        # The article was removed (or never created) between publish and
        # consume.  Without this guard, article["url"] below raises
        # TypeError, the message is never acked, and the broker redelivers
        # it forever — a crash loop.  Ack and drop the message instead.
        logger.warning("article %s not found; discarding message", _id)
        channel.basic_ack(delivery_tag = method.delivery_tag)
        return

    # Issue a HEAD request first so we only download and re-parse the
    # full page when its ETag says the content actually changed.
    request = urllib2.Request(article["url"])
    request.get_method = lambda: "HEAD"

    response = urllib2.urlopen(request)

    logger.debug("response: %s", response)
    logger.debug("response.info(): %s", response.info())
    logger.debug("response.info().__class__: %s", response.info().__class__)

    etag = response.info().getheader("etag")

    # TODO Check Last-Modified?
    # TODO Use expires to set the next poll?
    # TODO Respect Cache-Control?
    # TODO Other header considerations.
    # TODO Use Content-Type to set encoding?

    if article.get("etag") != etag:
        logger.info("Parsing full HTML of %s", article["url"])

        article["etag"] = etag

        response = urllib2.urlopen(article["url"])

        soup = bs4.BeautifulSoup(response.read())

        # TODO Use this when more is required:
        #html = sanitize(soup)
        html = soup.get_text()

        article["parsed_at"] = datetime.datetime.now()

        # NOTE(review): sys.getsizeof is a shallow object size, not the
        # character/byte length of the text — kept for compatibility with
        # existing stored documents.
        logger.debug("HTML Size: %s B", sys.getsizeof(html))
        article["size"] = sys.getsizeof(html)

        # Derive deterministic storage coordinates from the article UUID:
        # first dash-separated group names the container, the rest the object.
        container_part, object_part = str(uuid.UUID(_id)).split("-", 1)
        article["text_container_name"] = "margarine-" + container_part
        article["text_object_name"] = object_part

        logger.info("Uploading text to cloudfiles")
        get_container(article["text_container_name"]).store_object(article["text_object_name"], html, content_type = "text/html")
        logger.info("Uploaded text to cloudfiles")

        articles.update({ "_id": _id }, { "$set": article }, upsert = True)

    logger.info("finished processing article: %s", article["url"])

    # Only ack once all work has succeeded so a crash triggers redelivery.
    channel.basic_ack(delivery_tag = method.delivery_tag)