def debug_url():
    """API route for URL debug.

    Fetches the URL given in the ``url`` query parameter (or reads a local
    file when the path starts with ``tests/`` and ENV is local), parses it
    as an HTML document and returns the parsed fields, global rank and
    ranking signals as a JSON string.
    """
    # TODO: have a quota per ip on this API to prevent abuse
    url = request.args.get("url")
    if not url:
        # Guard: .get() returns None when the parameter is absent; without
        # this check, url.startswith() below raises AttributeError (HTTP 500).
        return json.dumps({"error": "missing 'url' parameter"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:
        if not url.startswith("http"):
            url = "http://" + url
        # Timeout so a slow or unresponsive remote host cannot hang the
        # request worker indefinitely (requests has no default timeout).
        req = requests.get(url, timeout=30)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(
        doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
def debug_url():
    """API route for URL debug.

    Fetches the URL given in the ``url`` query parameter (or reads a local
    file when the path starts with ``tests/`` and ENV is local), parses it
    as an HTML document and returns the parsed fields, global rank and
    ranking signals as a JSON string.
    """
    # TODO: have a quota per ip on this API to prevent abuse
    url = request.args.get("url")
    if not url:
        # Guard: .get() returns None when the parameter is absent; without
        # this check, url.startswith() below raises AttributeError (HTTP 500).
        return json.dumps({"error": "missing 'url' parameter"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:
        if not url.startswith("http"):
            url = "http://" + url
        # Timeout so a slow or unresponsive remote host cannot hang the
        # request worker indefinitely (requests has no default timeout).
        req = requests.get(url, timeout=30)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(
        doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
def iter_documents(self, partition):
    """Yield *Document objects from one partition.

    This is the main public method for Sources.

    An optional ``maxdocs`` source argument caps how many documents are
    yielded; 0 (or unset) means no limit.
    """
    limit = int(self.args.get("maxdocs") or 0)
    count = 0
    for url, headers, document_type, index_level, body in self.iter_items(partition):
        yield load_document_type(
            document_type,
            body,
            url=url,
            headers=headers,
            index_level=index_level,
        )
        count += 1
        # Stop once a positive limit has been reached.
        if 0 < limit <= count:
            return
def iter_documents(self):
    """Yield *Document objects.

    This is the main public method for Sources.

    An optional ``maxdocs`` source argument caps how many documents are
    yielded; 0 (or unset) means no limit.
    """
    i = 0
    # 0 is the "no limit" sentinel, matching the partitioned variant of
    # this method (the previous 9999999999 sentinel made an explicit
    # maxdocs=0 yield exactly one document, which was unintended).
    maxdocs = int(self.args.get("maxdocs") or 0)

    for url, headers, document_type, index_level, body in self.iter_items():
        document = load_document_type(
            document_type, body, url=url, headers=headers, index_level=index_level
        )
        yield document
        i += 1
        # Fix off-by-one: the previous `i > maxdocs` check yielded
        # maxdocs + 1 documents; stop after exactly maxdocs when a
        # positive limit is set.
        if i >= maxdocs > 0:
            return