Python get_html_from_raw_text示例

编程语言: Python

命名空间/包名称: cl.corpus_importer.dup_helpers

方法/功能: get_html_from_raw_text

hotexamples.com的示例: 3

Python get_html_from_raw_text - 已找到3个示例。这些是从开源项目中提取的最受好评的cl.corpus_importer.dup_helpers.get_html_from_raw_text现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

def import_law_box_case(case_path, i):
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(raw_text)
    citations = get_citations_from_tree(complete_html_tree, case_path)
    if not citations:
        print "******** F: %s ********" % case_path

    """

示例#2

显示文件

文件： import_law_box.py 项目： zayedmohamed/courtlistener

def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        html=clean_html_str,
        # we clear this field later, putting the value into html_lawbox.
        date_filed=get_date_filed(clean_html_tree,
                                  citations=citations,
                                  case_path=case_path,
                                  court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )

    docket = Docket(
        docket_number=get_docket_number(clean_html_tree,
                                        case_path=case_path,
                                        court=court),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.items():
        setattr(doc, k, v)

    doc.docket = docket

    return doc

示例#3

显示文件

文件： import_law_box.py 项目： Andr3iC/courtlistener

def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        html=clean_html_str,
        # we clear this field later, putting the value into html_lawbox.
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )

    docket = Docket(
        docket_number=get_docket_number(
            clean_html_tree,
            case_path=case_path,
            court=court
        ),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.items():
        setattr(doc, k, v)

    doc.docket = docket

    return doc