예제 #1
0
파일: server.py 프로젝트: x0rzkov/cosr-back
def debug_url():
    """ API route for URL debug.

    Reads the ``url`` query parameter, loads the document either from a
    local ``tests/`` file (only when ``config["ENV"]`` is ``"local"``) or
    with an HTTP GET, runs it through the indexer and reports the result.

    Returns:
        A JSON-encoded string with the keys ``url``, ``word_groups``,
        ``rank``, ``title_raw``, ``title``, ``summary``, ``langs``,
        ``links`` and ``ranking_signals``. When the ``url`` parameter is
        missing or empty, the JSON object holds a single ``error`` key.

    Raises:
        Nothing is caught here: errors from ``open()`` or ``requests.get()``
        (including a timeout) propagate to the caller.
    """

    # TODO: have a quota per ip on this API to prevent abuse

    url = request.args.get("url")

    # A missing parameter would otherwise blow up on url.startswith() below
    # (presumably .get() yields None in that case); answer with a readable
    # payload instead.
    if not url:
        return json.dumps({"error": "Missing required parameter: url"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:

        if not url.startswith("http"):
            url = "http://" + url

        # requests applies no timeout unless asked to; bound the wait so an
        # unresponsive host cannot block this handler forever.
        req = requests.get(url, timeout=10)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(
        doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
예제 #2
0
파일: server.py 프로젝트: JBaba/cosr-back
def debug_url():
    """ API route for URL debug.

    Fetches the document named by the ``url`` query parameter -- from a
    local ``tests/`` file when ``config["ENV"]`` is ``"local"``, otherwise
    with an HTTP GET -- then parses and ranks it with the indexer.

    Returns:
        A JSON-encoded string with the keys ``url``, ``word_groups``,
        ``rank``, ``title_raw``, ``title``, ``summary``, ``langs``,
        ``links`` and ``ranking_signals``. When the ``url`` parameter is
        missing or empty, the JSON object holds a single ``error`` key.

    Raises:
        Nothing is caught here: errors from ``open()`` or ``requests.get()``
        (including a timeout) propagate to the caller.
    """

    # TODO: have a quota per ip on this API to prevent abuse

    url = request.args.get("url")

    # Without this guard a request lacking the parameter fails on
    # url.startswith() (presumably .get() yields None then); return a
    # readable error payload instead.
    if not url:
        return json.dumps({"error": "Missing required parameter: url"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:

        if not url.startswith("http"):
            url = "http://" + url

        # requests waits indefinitely unless a timeout is given; cap it so a
        # stalled remote server cannot hang this handler.
        req = requests.get(url, timeout=10)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
예제 #3
0
    def iter_documents(self, partition):
        """ Generate one document object per item of the given partition.

        Each tuple produced by ``self.iter_items(partition)`` is turned into
        a document with ``load_document_type``. When the ``maxdocs`` argument
        is a positive number, generation stops once that many documents have
        been yielded; a missing or zero value means no limit.
        """

        limit = int(self.args.get("maxdocs") or 0)

        for count, item in enumerate(self.iter_items(partition), 1):
            url, headers, document_type, index_level, body = item

            yield load_document_type(
                document_type,
                body,
                url=url,
                headers=headers,
                index_level=index_level
            )

            # Only a positive limit is enforced
            if limit > 0 and count >= limit:
                return
예제 #4
0
    def iter_documents(self):
        """ Yields *Document objects. This is the main public method for Sources

        Every tuple coming out of ``self.iter_items()`` is converted with
        ``load_document_type`` and yielded. If the ``maxdocs`` argument is a
        positive number, at most that many documents are yielded; a missing,
        zero or negative value means no limit.
        """

        # 0 stands for "no limit"
        maxdocs = int(self.args.get("maxdocs") or 0)

        for i, (url, headers, document_type, index_level, body) in enumerate(
                self.iter_items(), 1):

            yield load_document_type(
                document_type,
                body,
                url=url,
                headers=headers,
                index_level=index_level
            )

            # i documents have been yielded so far: stop at exactly maxdocs
            # (a strict "i > maxdocs" check would let one extra through).
            if i >= maxdocs > 0:
                return