def debug_url():
    """API route for URL debug.

    Fetches the URL given in the ``url`` query parameter (or reads a local
    file when the path starts with ``tests/`` and ENV is local), parses it
    as an HTML document and returns the parsed fields, global rank and
    ranking signals as a JSON string.
    """
    # TODO: have a quota per ip on this API to prevent abuse
    url = request.args.get("url")
    if not url:
        # Guard: .get() returns None when the parameter is absent; without
        # this check, url.startswith() below raises AttributeError (HTTP 500).
        return json.dumps({"error": "missing 'url' parameter"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:
        if not url.startswith("http"):
            url = "http://" + url
        # Timeout so a slow or unresponsive remote host cannot hang the
        # request worker indefinitely (requests has no default timeout).
        req = requests.get(url, timeout=30)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(
        doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
def debug_url():
    """API route for URL debug.

    Fetches the URL given in the ``url`` query parameter (or reads a local
    file when the path starts with ``tests/`` and ENV is local), parses it
    as an HTML document and returns the parsed fields, global rank and
    ranking signals as a JSON string.
    """
    # TODO: have a quota per ip on this API to prevent abuse
    url = request.args.get("url")
    if not url:
        # Guard: .get() returns None when the parameter is absent; without
        # this check, url.startswith() below raises AttributeError (HTTP 500).
        return json.dumps({"error": "missing 'url' parameter"})

    # Special case for local files
    if url.startswith("tests/") and config["ENV"] == "local":
        with open(url, "rb") as f:
            cnt = f.read()
        headers = {}
    else:
        if not url.startswith("http"):
            url = "http://" + url
        # Timeout so a slow or unresponsive remote host cannot hang the
        # request worker indefinitely (requests has no default timeout).
        req = requests.get(url, timeout=30)
        cnt = req.content
        headers = dict(req.headers)

    doc = load_document_type("html", cnt, url=str(url), headers=headers)

    parsed = indexer.parse_document(doc)

    global_rank, ranking_signals = indexer.ranker.get_global_document_rank(
        doc, parsed["url_metadata"])

    # URL class is not serializable
    links = [{
        "href": link["href"].url,
        "words": link.get("words")
    } for link in doc.get_hyperlinks()]

    ret = {
        "url": parsed["url"].url,
        "word_groups": doc.get_word_groups(),
        "rank": global_rank,
        "title_raw": doc.get_title(),
        "title": parsed["title_formatted"],
        "summary": parsed["summary_formatted"],
        "langs": parsed["langs"],
        "links": links,
        "ranking_signals": ranking_signals
    }

    return json.dumps(ret)
def iter_documents(self, partition):
    """Yield *Document objects from one partition.

    This is the main public method for Sources.

    An optional ``maxdocs`` source argument caps how many documents are
    yielded; 0 (or unset) means no limit.
    """
    limit = int(self.args.get("maxdocs") or 0)
    count = 0
    for url, headers, document_type, index_level, body in self.iter_items(partition):
        yield load_document_type(
            document_type,
            body,
            url=url,
            headers=headers,
            index_level=index_level,
        )
        count += 1
        # Stop once a positive limit has been reached.
        if 0 < limit <= count:
            return
def iter_documents(self):
    """Yield *Document objects.

    This is the main public method for Sources.

    An optional ``maxdocs`` source argument caps how many documents are
    yielded; 0 (or unset) means no limit.
    """
    i = 0
    # 0 is the "no limit" sentinel, matching the partitioned variant of
    # this method (the previous 9999999999 sentinel made an explicit
    # maxdocs=0 yield exactly one document, which was unintended).
    maxdocs = int(self.args.get("maxdocs") or 0)

    for url, headers, document_type, index_level, body in self.iter_items():
        document = load_document_type(
            document_type, body, url=url, headers=headers, index_level=index_level
        )
        yield document
        i += 1
        # Fix off-by-one: the previous `i > maxdocs` check yielded
        # maxdocs + 1 documents; stop after exactly maxdocs when a
        # positive limit is set.
        if i >= maxdocs > 0:
            return